In [52]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim import Adam, SGD
from transformers import BertModel
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [53]:
# Raw competition files; both are read as tab-separated, UTF-8 encoded CSVs.
train_data_path = "downloads/train_set.csv"
test_data_path = "downloads/test_a.csv"

train_data, test_data = (
    pd.read_csv(csv_path, sep="\t", encoding="UTF-8")
    for csv_path in (train_data_path, test_data_path)
)

# Data analysis

## 训练集数据

In [54]:
# Preview the first rows of the training frame (columns: label, text).
train_data.head()

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


## label 数据类型

In [55]:
# Inspect the Python type of the first label value.
type(train_data["label"].iloc[0])

numpy.int64

## text 数据格式

In [56]:
# Show the raw text field of the first training row (space-separated token ids).
train_data["text"].iloc[0]

'2967 6758 339 2021 1854 3731 4109 3792 4149 1519 2058 3912 2465 2410 1219 6654 7539 264 2456 4811 1292 2109 6905 5520 7058 6045 3634 6591 3530 6508 2465 7044 1519 3659 2073 3750 3731 4109 3792 6831 2614 3370 4269 3370 486 5770 4109 4125 3750 5445 2466 6831 6758 3743 3630 1726 2313 5906 826 4516 657 900 1871 7044 3750 2967 3731 1757 1939 648 2828 4704 7039 3706 3750 965 2490 7399 3743 2145 2407 7451 3775 6017 5998 1641 299 4704 2621 7029 3056 6333 433 648 1667 1099 900 2289 1099 648 5780 220 7044 1279 7426 4269 3750 2967 6758 6631 3099 2205 7305 2620 5977 3750 3329 1793 6666 2042 3193 4149 1519 7039 3706 2446 5399 648 4124 2058 3912 248 3193 2252 5649 2212 4939 7239 3310 4525 2400 900 5770 4109 4125 7044 4921 265 1397 4699 1699 669 6407 3750 1271 1271 4741 669 4659 3870 4030 4167 5338 25 3466 6909 4417 1859 3750 1465 7194 648 3938 1571 848 6986 827 2124 3750 1991 7444 7037 2729 908 6308 3750 1889 6810 4190 591 5598 2289 2109 6831 6407 2400 5410 517 900 25 3731 4109 3792 4128 1679 4811 

## 字表长度

In [57]:
# Scan every training text for the largest token id; the printed value
# (largest id + 1) is what the config later uses as "vocab_size".
max_char = 0
for text in train_data["text"]:
    token_ids = [int(token) for token in text.split()]
    if token_ids:
        max_char = max(max_char, max(token_ids))
print(max_char + 1)

7550


## text数据长度

In [58]:
# Number of whitespace-separated tokens in each training text.
text_len_arr = np.array([len(tokens) for tokens in map(str.split, train_data["text"].values)])

In [59]:
# Summary statistics (max / min / mean / median) of the per-text token counts.
length_stats = (
    ("最大文本长度：", np.max),
    ("最小文本长度：", np.min),
    ("平均文本长度：", np.mean),
    ("文本长度中位数：", np.median),
)
for caption, reducer in length_stats:
    print(caption, reducer(text_len_arr))

最大文本长度： 57921
最小文本长度： 2
平均文本长度： 907.20711
文本长度中位数： 676.0


## 类别样本数统计

In [60]:
# Count how many training rows carry each label (shows the class imbalance).
train_data["label"].value_counts()

label
0     38918
1     36945
2     31425
3     22133
4     15016
5     12232
6      9985
7      8841
8      7847
9      5878
10     4920
11     3131
12     1821
13      908
Name: count, dtype: int64

In [61]:
# Preview the first rows of the test frame (text column only, no labels).
test_data.head()

Unnamed: 0,text
0,5399 3117 1070 4321 4568 2621 5466 3772 4516 2...
1,2491 4109 1757 7539 648 3695 3038 4490 23 7019...
2,2673 5076 6835 2835 5948 5677 3247 4124 2465 5...
3,4562 4893 2210 4761 3659 1324 2595 5949 4583 2...
4,4269 7134 2614 1724 4464 1324 3370 3370 2106 2...


## 每个类别按比例拆分训练集和验证集

In [62]:
# Scratch call: the drawn integer is displayed but not assigned or reused.
random.randint(0,10)

6

In [63]:
# Hold out a fixed fraction of every class as a validation set.
test_size = 0.1
split_seed = 42  # fixed seed so the split is identical on every Restart & Run All
train_data_ls = []
test_data_ls = []
# Split each label separately so every class keeps the same train/validation ratio.
for label in sorted(set(train_data["label"].values)):
    sub_data = train_data[train_data["label"] == label]
    sub_data_train, sub_data_test = train_test_split(
        sub_data, test_size=test_size, random_state=split_seed
    )
    train_data_ls.append(sub_data_train)
    test_data_ls.append(sub_data_test)
train_data_concat = pd.concat(train_data_ls, axis=0)
test_data_concat = pd.concat(test_data_ls, axis=0)
print(len(train_data_concat))
print(len(test_data_concat))
# Create the output directory first; to_csv does not create missing folders.
os.makedirs("datasets", exist_ok=True)
train_data_concat.to_csv("datasets/train_data.csv", index=False)
test_data_concat.to_csv("datasets/valid_data.csv", index=False)

179993
20007


# Config

In [71]:
# Hyper-parameters and file locations shared by the loader, model and training loop.
# "train_data_path" used to appear twice in this literal; only the later value
# ("datasets/train_data.csv") ever took effect, so the shadowed entry is removed.
config = {
    # data
    "max_length": 2000,        # every sequence is truncated / zero-padded to this length
    "batch_size": 128,
    "vocab_size": 7550,        # largest token id + 1, as printed by the data-analysis cell
    "class_num": 14,
    "train_data_path": "datasets/train_data.csv",
    "valid_data_path": "datasets/valid_data.csv",
    "test_data_path": "downloads/test_a.csv",
    # model
    "num_layers": 2,
    "hidden_size": 512,
    "kernel_size": 3,
    "pooling_style": "max",    # "max" selects max pooling; any other value selects average pooling
    "model_path": "model/model.pth",
    # optimisation
    "optimizer": "adam",       # "adam" or "sgd"
    "learning_rate": 0.001,
    "epoch": 3,
}

## Loader

In [72]:
class DataGenerator:
    """In-memory dataset of ``[token_ids, label]`` tensor pairs read from a CSV file.

    The CSV must provide a ``text`` column (whitespace-separated integer token
    ids) and a ``label`` column. Every row is converted once, at construction
    time, and kept in ``self.data``.

    Args:
        data_path: path of the CSV file to read.
        config: settings dict; ``config["max_length"]`` is used (through
            ``padding``) as the fixed length of every encoded sequence.
    """

    def __init__(self, data_path, config):
        self.data_path = data_path
        self.config = config
        self.load()

    def load(self):
        """Read the CSV and fill ``self.data`` with ``[LongTensor, LongTensor]`` pairs."""
        self.data = []
        df = pd.read_csv(self.data_path)
        for text, label in zip(df["text"], df["label"]):
            sequence = [int(token) for token in text.split()]
            # Use the config this instance was built with, not a notebook-level global.
            encode_x = torch.LongTensor(padding(sequence, self.config))
            encode_y = torch.LongTensor([label])
            self.data.append([encode_x, encode_y])
        return

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]



def padding(input_sequence, config):
    """Return a copy of ``input_sequence`` with exactly ``config["max_length"]`` items.

    Longer sequences are cut on the right; shorter ones are right-filled with 0.
    The caller's list is left untouched.
    """
    target_len = config["max_length"]
    clipped = input_sequence[:target_len]
    missing = target_len - len(clipped)
    return clipped + [0] * missing


def load_data(data_path, config, shuffle=True):
    """Wrap the labelled CSV at ``data_path`` in a ``DataLoader``.

    Batches have ``config["batch_size"]`` items; ``shuffle`` is forwarded to
    the ``DataLoader`` unchanged.
    """
    return DataLoader(
        DataGenerator(data_path, config),
        batch_size=config["batch_size"],
        shuffle=shuffle,
    )

In [73]:
# Sanity check: build the dataset from the training CSV and print its first item.
dg = DataGenerator(config["train_data_path"], config)
print(dg[0])

[tensor([4118, 5176, 4559,  ...,    0,    0,    0]), tensor([0])]


## Model

In [74]:
class CNN(nn.Module):
    """One bias-free 1-D convolution applied along the sequence axis.

    ``config["hidden_size"]`` is used as both the input and the output channel
    count; ``config["kernel_size"]`` is the kernel width.
    """

    def __init__(self, config):
        super().__init__()
        channels = config["hidden_size"]
        width = config["kernel_size"]
        # Symmetric padding; keeps the sequence length unchanged for odd kernel widths.
        same_pad = int((width - 1) / 2)
        self.cnn = nn.Conv1d(channels, channels, width, bias=False, padding=same_pad)

    def forward(self, x):
        """Convolve ``x`` of shape (batch, seq_len, hidden) and return the same layout."""
        # Conv1d wants channels on axis 1, so swap axes going in and coming out.
        channels_first = x.transpose(1, 2)
        convolved = self.cnn(channels_first)
        return convolved.transpose(1, 2)
    
class GatedCNN(nn.Module):
    """Gated convolution: one CNN produces features, a second CNN produces their gate."""

    def __init__(self, config):
        super().__init__()
        self.cnn = CNN(config)
        self.gate = CNN(config)

    def forward(self, x):
        """Return ``cnn(x) * sigmoid(gate(x))``, multiplied element by element."""
        features = self.cnn(x)
        # The sigmoid squashes the gate into (0, 1), so it acts as a per-element weight.
        gate_weights = torch.sigmoid(self.gate(x))
        return features * gate_weights
    
class StackGatedCNN(nn.Module):
    """Stack of ``config["num_layers"]`` encoder blocks built around gated convolutions.

    Each block mirrors the layout of a BERT transformer layer with the
    self-attention replaced by a ``GatedCNN``: gated conv + residual + LayerNorm,
    followed by a two-layer feed-forward + residual + LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        self.num_layers = config["num_layers"]
        self.hidden_size = config["hidden_size"]
        depth, width = self.num_layers, self.hidden_size
        # nn.ModuleList registers every sub-module while still allowing list-style indexing.
        self.gcnn_layers = nn.ModuleList([GatedCNN(config) for _ in range(depth)])
        self.ff_liner_layers1 = nn.ModuleList([nn.Linear(width, width) for _ in range(depth)])
        self.ff_liner_layers2 = nn.ModuleList([nn.Linear(width, width) for _ in range(depth)])
        # Despite the "bn_" prefix these are LayerNorm modules.
        self.bn_after_gcnn = nn.ModuleList([nn.LayerNorm(width) for _ in range(depth)])
        self.bn_after_ff = nn.ModuleList([nn.LayerNorm(width) for _ in range(depth)])

    def forward(self, x):
        """Run ``x`` through every block in order and return the final hidden states."""
        for layer_idx in range(self.num_layers):
            # Gated convolution with a residual connection, then normalisation.
            x = self.bn_after_gcnn[layer_idx](self.gcnn_layers[layer_idx](x) + x)
            # Feed-forward part: linear -> ReLU (BERT uses GELU here) -> linear.
            hidden = torch.relu(self.ff_liner_layers1[layer_idx](x))
            projected = self.ff_liner_layers2[layer_idx](hidden)
            # Second residual connection, then normalisation.
            x = self.bn_after_ff[layer_idx](x + projected)
        return x
    
class TorchModel(nn.Module):
    """Text classifier: embedding -> StackGatedCNN encoder -> pooling -> linear head.

    Args:
        config: settings dict providing ``hidden_size``, ``class_num``,
            ``vocab_size`` and ``pooling_style`` ("max" selects max pooling over
            the sequence axis, any other value selects average pooling), plus
            whatever ``StackGatedCNN`` reads from it.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config["hidden_size"]
        class_num = config["class_num"]
        vocab_size = config["vocab_size"]
        # Index 0 is the padding id: its embedding stays a zero vector.
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.encoder = StackGatedCNN(config)
        self.classify = nn.Linear(hidden_size, class_num)
        self.pooling_style = config["pooling_style"]
        self.loss = nn.functional.cross_entropy

    def forward(self, x, target=None):
        """Return class logits, or the cross-entropy loss when ``target`` is given.

        Args:
            x: integer token ids of shape (batch, seq_len).
            target: optional class ids, shape (batch,) or (batch, 1).

        Returns:
            Logits of shape (batch, class_num) when ``target`` is None,
            otherwise a scalar loss tensor.
        """
        x = self.embedding(x)
        x = self.encoder(x)
        if isinstance(x, tuple):
            x = x[0]
        # Pool over the sequence axis only. Reducing along dim=1 keeps the batch
        # axis even for a single-sample batch, which a bare .squeeze() would drop,
        # and avoids rebuilding a pooling module on every forward call.
        if self.pooling_style == "max":
            x = x.max(dim=1)[0]
        else:
            x = x.mean(dim=1)
        predict = self.classify(x)
        if target is not None:
            # Flatten (batch, 1) labels to (batch,) without collapsing a batch of one.
            return self.loss(predict, target.reshape(-1))
        return predict
        
def choose_optimizer(config, model):
    """Build the optimizer named by ``config["optimizer"]`` for ``model``'s parameters.

    Args:
        config: settings dict with ``optimizer`` ("adam" or "sgd") and
            ``learning_rate``.
        model: module whose parameters will be optimised.

    Returns:
        An ``Adam`` or ``SGD`` instance.

    Raises:
        ValueError: if ``config["optimizer"]`` is neither "adam" nor "sgd".
    """
    optimizer = config["optimizer"]
    learning_rate = config["learning_rate"]
    if optimizer == "adam":
        return Adam(model.parameters(), lr=learning_rate)
    elif optimizer == "sgd":
        return SGD(model.parameters(), lr=learning_rate)
    # Fail here instead of returning None, which would only surface later as an
    # AttributeError inside the training loop.
    raise ValueError("unsupported optimizer: %r (expected 'adam' or 'sgd')" % (optimizer,))
        

# Evaluate

In [75]:
class Evaluator:
    """Measures classification accuracy of ``model`` on the validation CSV.

    Args:
        config: settings dict; ``config["valid_data_path"]`` names the
            validation file handed to ``load_data``.
        model: the model to evaluate; called as ``model(input_ids)`` and
            expected to return per-class scores.
    """

    def __init__(self, config, model):
        self.config = config
        self.model = model
        # Accuracy does not depend on sample order, so the validation set is not shuffled.
        self.valid_data = load_data(self.config["valid_data_path"], config, shuffle=False)
        self.state_dic = {"correct": 0, "wrong": 0}

    def eval(self, epoch):
        """Evaluate the model on the whole validation set and return its accuracy."""
        print("开始测试第%d轮模型效果：" % epoch)
        self.model.eval()
        self.state_dic = {"correct": 0, "wrong": 0}  # reset counters from the previous round
        for batch_data in self.valid_data:
            if torch.cuda.is_available():
                batch_data = [b.cuda() for b in batch_data]
            input_ids, labels = batch_data
            with torch.no_grad():
                pred_result = self.model(input_ids)
            self.write_stats(labels, pred_result)
        acc = self.show_stats()
        return acc

    def write_stats(self, labels, pred_result):
        """Add one batch of results to the running correct / wrong counters.

        Args:
            labels: true class ids, shape (batch,) or (batch, 1).
            pred_result: per-class scores, shape (batch, class_num).
        """
        labels = labels.reshape(-1)
        # A lone sample may arrive as a 1-D score vector; treat it as a batch of one.
        if pred_result.dim() == 1:
            pred_result = pred_result.unsqueeze(0)
        assert len(labels) == len(pred_result)
        # Compare the whole batch at once instead of converting tensors to int per sample.
        hits = int((pred_result.argmax(dim=1) == labels).sum())
        self.state_dic["correct"] += hits
        self.state_dic["wrong"] += len(labels) - hits
        return

    def show_stats(self):
        """Print the counters and return the accuracy (0.0 when nothing was evaluated)."""
        correct = self.state_dic["correct"]
        wrong = self.state_dic["wrong"]
        total = correct + wrong
        # Guard against an empty validation set, which would otherwise divide by zero.
        accuracy = correct / total if total else 0.0
        print("预测集合条目总量：%d" % total)
        print("预测正确条目：%d，预测错误条目：%d" % (correct, wrong))
        print("预测准确率：%f" % accuracy)
        print("--------------------")
        return accuracy

# Main

In [76]:
def main(config):
    """Train a ``TorchModel``, evaluate it after every epoch and save the final weights.

    Args:
        config: settings dict; this function reads ``train_data_path``,
            ``epoch`` and ``model_path`` and passes the dict on to the loader,
            model, optimizer and evaluator.

    Returns:
        Validation accuracy after the last epoch, or None when
        ``config["epoch"]`` is 0.
    """
    # Training data
    train_data = load_data(config["train_data_path"], config)
    # Model
    model = TorchModel(config)
    # Move the model to the GPU when one is available
    cuda_flag = torch.cuda.is_available()
    if cuda_flag:
        print("设备GPU可用，迁移模型至GPU")
        model = model.cuda()
    # Optimizer
    optimizer = choose_optimizer(config, model)
    # Validation helper
    evaluator = Evaluator(config, model)
    # Defined up front so the return below is valid even if no epoch runs.
    acc = None
    for epoch in range(config["epoch"]):
        model.train()
        print("epoch %d begin" % epoch)
        train_loss = []
        # Wrap the loader itself (not enumerate) so tqdm knows the batch count.
        for index, batch_data in enumerate(tqdm(train_data)):
            if cuda_flag:
                batch_data = [d.cuda() for d in batch_data]
            optimizer.zero_grad()
            input_ids, labels = batch_data
            loss = model(input_ids, labels)
            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            train_loss.append(loss_value)
            if index % 200 == 0:
                print("batch loss %f" % loss_value)
        print("epoch average loss: %f" % np.mean(train_loss))
        acc = evaluator.eval(epoch)
    # torch.save does not create missing directories.
    model_dir = os.path.dirname(config["model_path"])
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), config["model_path"])
    return acc

In [77]:
# Run the full training loop; acc holds the accuracy returned by main().
acc = main(config)

设备GPU可用，迁移模型至GPU
epoch 0 begin


1it [00:00,  1.25it/s]

batch loss 3.911251


201it [02:17,  1.46it/s]

batch loss 0.234336


401it [04:35,  1.45it/s]

batch loss 0.248294


601it [06:53,  1.44it/s]

batch loss 0.226673


801it [09:13,  1.42it/s]

batch loss 0.157625


1001it [11:33,  1.43it/s]

batch loss 0.136669


1201it [13:54,  1.43it/s]

batch loss 0.165293


1401it [16:13,  1.45it/s]

batch loss 0.094761


1407it [16:17,  1.44it/s]


epoch average loss: 0.269014
开始测试第0轮模型效果：
预测集合条目总量：20007
预测正确条目：18709，预测错误条目：1298
预测准确率：0.935123
--------------------
epoch 1 begin


1it [00:00,  1.43it/s]

batch loss 0.195534


201it [02:22,  1.40it/s]

batch loss 0.102615


401it [04:44,  1.42it/s]

batch loss 0.114018


601it [07:02,  1.45it/s]

batch loss 0.139019


801it [09:20,  1.45it/s]

batch loss 0.151046


1001it [11:41,  1.40it/s]

batch loss 0.064055


1201it [14:02,  1.41it/s]

batch loss 0.136434


1401it [16:23,  1.42it/s]

batch loss 0.354727


1407it [16:27,  1.42it/s]


epoch average loss: 0.154282
开始测试第1轮模型效果：
预测集合条目总量：20007
预测正确条目：18853，预测错误条目：1154
预测准确率：0.942320
--------------------
epoch 2 begin


1it [00:00,  1.40it/s]

batch loss 0.093485


201it [02:20,  1.45it/s]

batch loss 0.055522


401it [04:38,  1.45it/s]

batch loss 0.053100


601it [06:56,  1.44it/s]

batch loss 0.184178


801it [09:16,  1.40it/s]

batch loss 0.096344


1001it [11:38,  1.37it/s]

batch loss 0.160484


1201it [14:01,  1.41it/s]

batch loss 0.176824


1401it [16:22,  1.42it/s]

batch loss 0.139658


1407it [16:26,  1.43it/s]


epoch average loss: 0.109417
开始测试第2轮模型效果：
预测集合条目总量：20007
预测正确条目：19020，预测错误条目：987
预测准确率：0.950667
--------------------


# Predict

In [20]:
class TestDataGenerator:
    """In-memory dataset of token-id tensors read from an unlabelled CSV file.

    The CSV must provide a ``text`` column (whitespace-separated integer token
    ids). Items are stored in file order in ``self.data``.

    Args:
        data_path: path of the CSV file to read.
        config: settings dict; ``config["max_length"]`` is used (through
            ``padding``) as the fixed length of every encoded sequence.
    """

    def __init__(self, data_path, config):
        self.data_path = data_path
        self.config = config
        self.load()

    def load(self):
        """Read the CSV and fill ``self.data`` with one LongTensor per row."""
        self.data = []
        df = pd.read_csv(self.data_path)
        for text in df["text"]:
            sequence = [int(token) for token in text.split()]
            # Use the config this instance was built with, not a notebook-level global.
            encode_x = torch.LongTensor(padding(sequence, self.config))
            self.data.append(encode_x)
        return

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]
        
def load_test_data(data_path, config, shuffle=True):
    """Wrap the unlabelled CSV at ``data_path`` in a ``DataLoader``.

    Batches have ``config["batch_size"]`` items; ``shuffle`` is forwarded to
    the ``DataLoader`` unchanged. Pass ``shuffle=False`` whenever the outputs
    must stay in file order (predict_test does so).
    """
    return DataLoader(
        TestDataGenerator(data_path, config),
        batch_size=config["batch_size"],
        shuffle=shuffle,
    )

In [41]:
def predict_test(config):
    """Load the saved weights and predict a class id for every test row.

    Args:
        config: settings dict; this function reads ``test_data_path`` and
            ``model_path`` and passes the dict on to the loader and the model.

    Returns:
        A list of predicted class ids in the order of the rows of the test
        file (the loader is created with ``shuffle=False``).
    """
    test_data = load_test_data(config["test_data_path"], config, shuffle=False)
    model = TorchModel(config)
    # Load onto the CPU first so weights saved from a GPU run also load on a
    # machine without CUDA; the model is moved to the GPU below when available.
    model.load_state_dict(torch.load(config["model_path"], map_location="cpu"))
    model.eval()
    cuda_flag = torch.cuda.is_available()
    if cuda_flag:
        print("设备GPU可用，迁移模型至GPU")
        model = model.cuda()
    # Must be initialised outside the CUDA branch, otherwise a CPU run fails
    # on the first `result +=`.
    result = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch_data in tqdm(test_data):
            if cuda_flag:
                batch_data = batch_data.cuda()
            pred_result = model(batch_data)
            # dim=-1 plus reshape also copes with a 1-D score vector for a lone sample.
            result += torch.argmax(pred_result, dim=-1).reshape(-1).tolist()
    return result

In [42]:
# Run inference on the test file; result is the list returned by predict_test.
result=predict_test(config)

设备GPU可用，迁移模型至GPU


391it [00:22, 17.06it/s]


In [44]:
# Load the sample-submission file to use as the template for the predictions.
df=pd.read_csv("downloads/test_a_sample_submit.csv")

In [47]:
# Store the predictions in the label column; result must have one entry per row of df.
# NOTE(review): presumably the sample file lists rows in the same order as the test file — confirm.
df["label"]=result

In [49]:
# NOTE(review): this writes over the sample-submission file that df was read from;
# consider a separate output path so the original template is kept.
df.to_csv("downloads/test_a_sample_submit.csv",index=False)