In [None]:
from nltk import WordPunctTokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# import tensorflow as tf
from tqdm import tqdm
from torch.utils import data
import nltk
import re
from nltk.stem import WordNetLemmatizer
import pickle
import torch
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from sklearn import metrics
from torch.utils.data import DataLoader
cache_dir = 'cache'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
!pip install nltk
!pip install gensim
!pip install python-Levenshtein
!pip install tensorflow-gpu

# 1 参数

In [None]:

class Config:
    """Central place for every path and hyper-parameter used by the notebook."""

    def __init__(self):
        # ---- raw / intermediate CSV files ---------------------------------
        self.train_all = "data/pseudo_concat_train.csv"
        # CSV files holding the concatenated title + abstract text
        self.train_path = "predata/train_x.csv"
        self.valid_path = "predata/valid_x.csv"
        self.testall_path = "data/test.csv"
        self.test_path = "predata/test_x.csv"
        # vocabulary file
        self.vocab_path = "data/vocab.pkl"
        # pickled mapping: label id -> category name
        self.label_path = "predata/label_id2cate.pkl"

        # ---- cached, already tokenised/padded arrays ------------------------
        self.process_trainset_path = "predata/train_set.npy"
        self.process_trainlabel_path = "predata/train_label.npy"
        self.process_testset_path = "predata/test_set.npy"

        # ---- word-vector model files ------------------------------------------
        self.fastText_path = "model/fasttext.bin"
        self.word2vec_path = "model/word2vec.bin"
        self.glove_path  = "model/glove.bin"

        # ---- model / data dimensions ------------------------------------------
        # dimensionality of each word vector
        self.embedding_size = 128
        # maximum number of words kept in the vocabulary
        self.max_vocab_size = 50000
        # number of tokens per (padded / truncated) input sequence
        self.max_len = 128
        # number of target categories
        self.num_class = 39

        # ---- training -----------------------------------------------------------
        # directory where model checkpoints are written
        self.save_path = "saved/"
        self.batch_size = 1000
        self.lr = 0.001
        self.num_epochs = 8  # the author also noted 50 as an alternative
        # which classifier to build: "TextCNN", "FastText", "TextRNN" or "DPCNN"
        self.model = "TextCNN"


# Single shared configuration instance used by the rest of the notebook.
args = Config()


# 2 模型

## 2.1 TextCNN

In [None]:
class TextCNN(nn.Module):
    """Three-branch TextCNN classifier over (optionally pre-trained) word embeddings.

    Each branch applies a 2-D convolution spanning ``k`` consecutive tokens and
    the full embedding width, followed by ReLU and max-over-time pooling.
    The three pooled feature maps are concatenated and fed to a linear layer.

    Args:
        args: configuration object providing ``embedding_size``, ``max_len``
            and ``num_class``.
        pretrained_path: 2-D float tensor ``[vocab_size, embedding_size]`` with
            the initial embedding weights (despite the name it is a tensor,
            not a file path).

    Note:
        The pooling windows are sized from ``args.max_len``, so inputs are
        expected to be exactly ``max_len`` tokens long.
    """

    def __init__(self, args, pretrained_path):
        super(TextCNN, self).__init__()
        self.dim_embed = args.embedding_size
        self.num_filters = 256
        self.kernel_size = (4, 5, 3)
        self.max_len = args.max_len
        self.n_vocab = pretrained_path.shape[0]  # vocabulary size (used when not loading pre-trained vectors)
        self.num_classes = args.num_class  # number of target categories
        self.pretrained = True
        self.pretrained_path = pretrained_path

        if self.pretrained:
            # freeze=False: the pre-trained vectors are fine-tuned during training.
            self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)

        self.conv1 = nn.Conv2d(1, self.num_filters, (self.kernel_size[0], self.dim_embed))
        self.conv2 = nn.Conv2d(1, self.num_filters, (self.kernel_size[1], self.dim_embed))
        self.conv3 = nn.Conv2d(1, self.num_filters, (self.kernel_size[2], self.dim_embed))
        # Each pooling window must cover the whole conv output
        # (height = max_len - k + 1) so that it is a true max-over-time.
        self.max_pool1 = nn.MaxPool2d((self.max_len - self.kernel_size[0] + 1, 1))
        self.max_pool2 = nn.MaxPool2d((self.max_len - self.kernel_size[1] + 1, 1))
        # Fixed: this used kernel_size[1], which made the window shorter than the
        # conv3 output and silently ignored the trailing positions.
        self.max_pool3 = nn.MaxPool2d((self.max_len - self.kernel_size[2] + 1, 1))
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(self.num_filters * 3, self.num_classes)

    def forward(self, x):
        """Return class logits ``[batch_size, num_classes]`` for token ids ``x``
        of shape ``[batch_size, max_len]``."""
        batch_size = x.shape[0]

        x = self.embedding(x)              # [batch_size, max_len, dim_embed]
        x = x.unsqueeze(1)                 # [batch_size, 1, max_len, dim_embed]

        x1 = F.relu(self.conv1(x))         # [batch_size, num_filters, max_len-kernel_size[0]+1, 1]
        x2 = F.relu(self.conv2(x))         # [batch_size, num_filters, max_len-kernel_size[1]+1, 1]
        x3 = F.relu(self.conv3(x))         # [batch_size, num_filters, max_len-kernel_size[2]+1, 1]

        x1 = self.max_pool1(x1)            # [batch_size, num_filters, 1, 1]
        x2 = self.max_pool2(x2)            # [batch_size, num_filters, 1, 1]
        x3 = self.max_pool3(x3)            # [batch_size, num_filters, 1, 1]

        # Concatenating on the last axis (and then flattening) keeps the same
        # feature ordering as before, so existing checkpoints stay compatible.
        x = torch.cat((x1, x2, x3), -1)    # [batch_size, num_filters, 1, 3]
        x = x.view(batch_size, -1)         # [batch_size, num_filters*3]

        x = self.dropout(x)
        x = self.fc(x)                     # [batch_size, num_classes]

        return x

## 2.2 FastText

In [None]:
class FastText(nn.Module):
    """Bag-of-embeddings classifier: mean-pooled word vectors -> MLP -> logits.

    Args:
        args: configuration object providing ``embedding_size`` and ``num_class``.
        pretrained_path: 2-D float tensor ``[vocab_size, embedding_size]`` used
            to initialise the (trainable) embedding table.
    """

    def __init__(self, args, pretrained_path):
        super().__init__()
        self.dim_embed = args.embedding_size
        self.hidden_size = 256
        self.n_vocab = pretrained_path.shape[0]
        self.num_classes = args.num_class
        self.pretrained = True
        self.pretrained_path = pretrained_path

        self.embedding = (
            nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
            if self.pretrained
            else nn.Embedding(self.n_vocab, self.dim_embed)
        )

        # Author's note: a dropout rate of 0.6 gave the best accuracy on this
        # text-classification task.
        self.dropout = nn.Dropout(0.6)
        self.fc1 = nn.Linear(self.dim_embed, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to logits ``[batch, num_classes]``."""
        # Average the word vectors over the sequence axis (padding included).
        pooled = self.embedding(x).mean(dim=1)
        hidden = F.relu(self.fc1(self.dropout(pooled)))
        return self.fc2(hidden)

## 2.3 TextRNN

In [None]:
class TextRNN(nn.Module):
    """Two-layer bidirectional LSTM classifier over word embeddings.

    Args:
        args: configuration object providing ``embedding_size`` and ``num_class``.
        pretrained_path: 2-D float tensor ``[vocab_size, embedding_size]`` used
            to initialise the (trainable) embedding table.
    """

    def __init__(self, args, pretrained_path):
        super().__init__()
        self.pretrained = True
        self.pretrained_path = pretrained_path
        self.n_vocab = pretrained_path.shape[0]
        self.dim_embed = args.embedding_size
        self.hidden_size = 64
        self.num_layers = 2
        self.dropout = 0.4  # dropout applied between the stacked LSTM layers
        self.num_classes = args.num_class

        if not self.pretrained:
            self.embedding = nn.Embedding(
                self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)
        else:
            self.embedding = nn.Embedding.from_pretrained(
                self.pretrained_path, freeze=False)

        self.lstm = nn.LSTM(
            input_size=self.dim_embed,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=self.dropout,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated -> 2 * hidden_size.
        self.fc = nn.Linear(self.hidden_size * 2, self.num_classes)

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to logits ``[batch, num_classes]``."""
        embedded = self.embedding(x)
        sequence_output, _ = self.lstm(embedded)
        # Classify from the output at the last time step.
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

## 2.4 DPCNN

In [None]:
class DPCNN(nn.Module):
    """Deep Pyramid CNN text classifier.

    A region convolution over the embeddings is followed by two equal-width
    convolutions and then by repeated down-sampling blocks (stride-2 max-pool
    plus two convolutions with a shortcut connection) until a single position
    is left per filter.

    Args:
        args: configuration object providing ``embedding_size`` and ``num_class``.
        pretrained_path: 2-D float tensor ``[vocab_size, embedding_size]`` used
            to initialise the (trainable) embedding table.
    """

    def __init__(self, args, pretrained_path):
        super(DPCNN, self).__init__()
        self.dim_embed = args.embedding_size
        self.num_filters = 256
        self.kernel_size = 3
        self.n_vocab = pretrained_path.shape[0]
        self.num_classes = args.num_class
        self.pretrained = True
        self.pretrained_path = pretrained_path

        if self.pretrained:
            self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
        else:
            self.embedding = nn.Embedding(self.n_vocab, self.dim_embed)

        self.conv_region = nn.Conv2d(1, self.num_filters, (self.kernel_size, self.dim_embed), stride=1)
        self.conv = nn.Conv2d(self.num_filters, self.num_filters, (self.kernel_size, 1), stride=1)
        self.max_pool = nn.MaxPool2d(kernel_size=(self.kernel_size, 1), stride=2)
        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))  # one row on top and bottom
        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))  # one row at the bottom
        self.relu = nn.ReLU()
        self.fc = nn.Linear(self.num_filters, self.num_classes)

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` (seq_len >= 3) to logits
        ``[batch, num_classes]``."""
        x = self.embedding(x)     # [batch_size, seq_len, dim_embed]
        x = x.unsqueeze(1)        # [batch_size, 1, seq_len, dim_embed]
        x = self.conv_region(x)   # [batch_size, num_filters, seq_len-2, 1]
        x = self.padding1(x)      # [batch_size, num_filters, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)          # [batch_size, num_filters, seq_len-2, 1]
        x = self.padding1(x)      # [batch_size, num_filters, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)          # [batch_size, num_filters, seq_len-2, 1]
        # Each block halves the height (floor).  Looping while height > 1
        # (instead of > 2) also reduces a height of 2 to 1; previously such
        # sequence lengths left two rows and crashed in the linear layer.
        while x.size()[2] > 1:
            x = self._block(x)    # ends at [batch_size, num_filters, 1, 1]
        # Flatten explicitly: a bare squeeze() would also drop the batch
        # dimension when batch_size == 1.
        x = x.flatten(start_dim=1)  # [batch_size, num_filters]
        x = self.fc(x)              # [batch_size, num_classes]

        return x

    def _block(self, x):
        """Down-sample by 2, apply two padded convolutions and add the shortcut."""
        x = self.padding2(x)
        px = self.max_pool(x)

        x = self.padding1(px)
        x = self.relu(x)
        x = self.conv(x)

        x = self.padding1(x)
        x = self.relu(x)
        x = self.conv(x)

        # Shortcut connection around the two convolutions.
        x = x + px
        return x

## 2.5 对抗训练

In [None]:
#对抗训练

class PGD():
    """Projected Gradient Descent adversarial perturbation of embedding weights.

    Typical use: after the normal ``loss.backward()``, call ``backup_grad()``,
    then run several ``attack()`` steps (re-computing the loss and gradients in
    between), and finally ``restore()`` the original embedding weights before
    ``optimizer.step()``.
    """

    def __init__(self, model):
        self.model = model
        self.emb_backup = {}   # parameter name -> clean embedding weights
        self.grad_backup = {}  # parameter name -> gradients of the clean pass

    def attack(self, epsilon=1., alpha=0.3, emb_name='emb', is_first_attack=1):
        """Take one gradient-ascent step on every matching embedding parameter.

        Args:
            epsilon: radius of the L2 ball (around the backed-up weights) that
                the accumulated perturbation is projected onto.
            alpha: step size of this attack step.
            emb_name: substring identifying the embedding parameter(s); set it
                to the name used by your model.
            is_first_attack: attack-step index. The clean weights are backed up
                only when this is ``0`` (i.e. on the first step of a loop
                ``for t in range(K): attack(is_first_attack=t)``). With the
                default of ``1`` no backup is taken.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if is_first_attack == 0:
                self.emb_backup[name] = param.data.clone()
            # Parameters that did not take part in the last backward pass have
            # no gradient; there is nothing to perturb them with.
            if param.grad is None:
                continue
            norm = torch.norm(param.grad)
            if norm != 0 and not torch.isnan(norm):
                r_at = alpha * param.grad / norm
                param.data.add_(r_at)
                param.data = self.project(self.emb_backup, name, param.data, epsilon)

    def restore(self, emb_name='emb'):
        """Put back the clean embedding weights saved by ``attack`` and clear the backup.

        Parameters without a backup entry are left untouched.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.emb_backup:
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, t, param_name, param_data, epsilon):
        """Project ``param_data`` onto the L2 ball of radius ``epsilon`` around ``t[param_name]``.

        Returns ``param_data`` unchanged when ``t`` holds no entry for
        ``param_name`` (no backup has been taken yet).
        """
        if param_name not in t:
            return param_data

        r = param_data - t[param_name]
        r_norm = torch.norm(r)
        if r_norm > epsilon:
            r = epsilon * r / r_norm
        return t[param_name] + r

    def backup_grad(self):
        """Save a copy of every trainable parameter's gradient (``None`` if it has none)."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.grad_backup[name] = None if param.grad is None else param.grad.clone()

    def restore_grad(self):
        """Re-install the gradients saved by ``backup_grad``.

        Parameters that were not backed up keep their current gradient.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.grad_backup:
                param.grad = self.grad_backup[name]
class FGM():
    """Fast Gradient Method: a single-step adversarial perturbation of embedding weights.

    Typical use: after the normal ``loss.backward()``, call ``attack()``,
    run a second forward/backward pass to accumulate the adversarial
    gradients, then ``restore()`` before ``optimizer.step()``.
    """

    def __init__(self, model):
        self.model = model
        self.backup = {}  # parameter name -> clean embedding weights

    def attack(self, epsilon=1., emb_name='embedding'):
        """Move every matching embedding parameter by ``epsilon`` along its normalised gradient.

        Args:
            epsilon: L2 norm of the perturbation added to each parameter.
            emb_name: substring identifying the embedding parameter(s); set it
                to the name used by your model.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                # No gradient (parameter unused in the last backward pass):
                # keep the backup so restore() stays consistent, but do not perturb.
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='embedding'):
        """Put back the clean weights saved by ``attack`` and clear the backup.

        Parameters without a backup entry are left untouched.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

# 3 训练集预处理和封装

## 3.1 去除停用词

In [None]:
import torch.nn.functional as F
import torch.nn as nn
from nltk.stem import WordNetLemmatizer
import re
import nltk
import torch
import numpy as np
from torch.utils import data
from sklearn.utils import resample

# en_stop = set(STOP_WORDS)

# custom_stop_words = [
#     'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
#     'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
#     'al.', 'elsevier', 'pmc', 'czi', 'www'
# ]
# for word in custom_stop_words:
#     en_stop.add(word)

# Stop-word list used by preprocess_text(): publication boilerplate terms
# followed by common English function words and contraction fragments.
en_stop = [
    # --- publication / preprint boilerplate ---
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al',
    'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using',
    'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'elsevier', 'pmc', 'czi', 'www',
    # --- pronouns ---
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # --- auxiliary verbs ---
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # --- articles, conjunctions, prepositions ---
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under',
    # --- adverbs, quantifiers ---
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # --- contraction fragments ---
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma',
    'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
]
def preprocess_text(document):
    """Normalise one document into a space-separated string of cleaned tokens.

    Steps, in order: cast to ``str``; replace newlines and every non-word
    character with a space; drop isolated single letters; collapse
    whitespace; generalise digit runs to ``#`` placeholders; lower-case;
    lemmatise each token with WordNet; drop stop words (``en_stop``) and
    tokens of three characters or fewer.

    Args:
        document: the raw text (any object; it is converted with ``str``).

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    stemmer = WordNetLemmatizer()

    document = str(document)
    document = document.replace("\n", ' ')
    # NOTE(review): this removes the two-character sequence "/'" — possibly
    # meant to strip escaped quotes ("\'"); left as in the original.
    document = document.replace("/'", '')

    # Replace every non-word character with a space.
    document = re.sub(r'\W', ' ', document)

    # Remove single letters that stand between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text.
    # Fixed: the pattern used to be r'\^...', which matches a literal '^'
    # character and therefore never fired.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into one space.
    document = re.sub(r'\s+', ' ', document)

    # Digit generalisation: a run of 5+ digits becomes '#####', then remaining
    # groups of 4 / 3 / 2 digits become '####' / '###' / '##'.
    # Single digits are left unchanged.
    document = re.sub('[0-9]{5,}', '#####', document)
    document = re.sub('[0-9]{4}', '####', document)
    document = re.sub('[0-9]{3}', '###', document)
    document = re.sub('[0-9]{2}', '##', document)

    document = document.lower()

    # Lemmatise, then filter in one pass.
    stop_words = frozenset(en_stop)  # O(1) membership instead of scanning the list
    lemmas = (stemmer.lemmatize(word) for word in document.split())
    # Keep a token only if it is not a stop word and is longer than three
    # characters (a length filter, not a frequency filter).
    tokens = [word for word in lemmas if word not in stop_words and len(word) > 3]

    return ' '.join(tokens)


def load_data_kfold(dataset,batch_size, k, n):
    """Build the train / validation loaders for fold ``n`` of a ``k``-fold split.

    The sample indices are shuffled with a fixed NumPy seed (42), so every
    call produces the same permutation and the folds do not overlap.
    Note that this re-seeds NumPy's global random generator.

    Args:
        dataset: any sized, indexable dataset accepted by ``DataLoader``.
        batch_size: batch size of both loaders.
        k: total number of folds.
        n: zero-based index of the fold used for validation.

    Returns:
        ``(train_loader, validation_loader)``; both draw their subset in
        random order via ``SubsetRandomSampler``.
    """
    print("Stacking第{}折正在划分数据集".format(n+1))

    total = len(dataset)
    print(total)

    # Deterministic permutation of all sample indices.
    order = list(range(total))
    np.random.seed(42)
    np.random.shuffle(order)

    # Fold n is the n-th slice of ``fold_size`` shuffled indices; samples
    # beyond k * fold_size are never used for validation.
    fold_size = int(total / k)
    held_out = order[fold_size * n:fold_size * (n + 1)]
    remaining = list(set(order) - set(held_out))

    train_loader = data.DataLoader(
        dataset, batch_size=batch_size, sampler=data.SubsetRandomSampler(remaining))
    validation_loader = data.DataLoader(
        dataset, batch_size=batch_size, sampler=data.SubsetRandomSampler(held_out))
    print("划分完成")
    return train_loader, validation_loader

## 3.2 训练词向量

In [None]:
import numpy as np
import pickle as pkl
from torch.utils import data
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from tqdm import tqdm
from nltk import WordPunctTokenizer
import tensorflow as tf
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import torch
from nltk import WordPunctTokenizer

In [None]:
import nltk
from torch.utils import data
from tqdm import tqdm
# import tensorflow as tf
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import FastText
import torch
from glove import Glove
from glove import Corpus
from nltk import WordPunctTokenizer
def build_word2vec(args, train, vector_tag='word2vec'):
    """Fit the tokenizer on the training corpus and build its embedding matrix.

    The title and abstract of every row are joined, cleaned with
    ``preprocess_text`` and tokenised. A Keras ``Tokenizer`` is fitted on the
    tokens, a word-vector model is loaded from disk (or trained and saved when
    no file exists), and one row per tokenizer index is filled with the
    corresponding word vector.

    Fixes relative to the previous version:
      * the combined texts were stored in ``np.empty_like(list_of_titles)``,
        a fixed-width string array that truncated every text to the length of
        the longest title;
      * the glove branch had broken indentation (the cell could not be
        parsed), trained when the file existed and loaded when it did not, and
        saved to a different path than it loaded from;
      * the fasttext branch left ``wv_model`` undefined when a cached model
        existed; the bare ``except`` hid the resulting NameError, so the
        returned matrix stayed completely random;
      * a print of the undefined name ``embedding_matrix_ft`` raised NameError;
      * one ``Word2Vec`` call used gensim-3 keyword names (``size``/``iter``)
        while the others used the gensim-4 names (``vector_size``/``epochs``).

    Args:
        args: configuration providing ``embedding_size``, ``fastText_path``,
            ``word2vec_path`` and ``glove_path``.
        train: DataFrame with ``title`` and ``abstract`` columns.
        vector_tag: which vectors to use: ``'word2vec'`` (default; the
            previous version always returned the Word2Vec matrix),
            ``'fasttext'`` or ``'glove'``.

    Returns:
        ``(embedding_matrix, tokenizer)`` where ``embedding_matrix`` is a NumPy
        array of shape ``[len(tokenizer.word_index) + 1, args.embedding_size]``.
        Rows of words unknown to the vector model (and row 0) keep their
        uniform-random initial values.

    Raises:
        ValueError: if ``vector_tag`` is not one of the supported names.

    Note:
        A model file found on disk is reused as-is; delete it to retrain after
        changing the corpus or ``embedding_size``.
    """
    # Imported under an alias because this notebook also defines a classifier
    # class called ``FastText``.
    from gensim.models import FastText as GensimFastText

    if vector_tag not in ('fasttext', 'word2vec', 'glove'):
        raise ValueError("unknown vector_tag: %r" % (vector_tag,))

    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

    # Build the combined texts in a plain Python list so nothing is truncated;
    # missing titles/abstracts are treated as empty strings.
    titles = train['title'].fillna('').astype(str)
    abstracts = train['abstract'].fillna('').astype(str)
    trainall_combine = [title + ' <sep> ' + abstract
                        for title, abstract in zip(titles, abstracts)]

    print('构造词向量：删除不需要字符...')
    final_corpus = [preprocess_text(sentence) for sentence in trainall_combine
                    if sentence.strip() != '']
    print('构造词向量：分词...')
    word_punctuation_tokenizer = WordPunctTokenizer()
    word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]
    # Word -> integer index encoder, fitted on the tokenised corpus.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)
    tokenizer.fit_on_texts(word_tokenized_corpus)

    # Hyper-parameters of the word-vector models.
    window_size = 40
    min_word = 5
    down_sampling = 1e-2

    print('构造词典：训练词向量...')
    if vector_tag == 'fasttext':
        if os.path.exists(args.fastText_path):
            ft_model = GensimFastText.load(args.fastText_path)
        else:
            print("正在训练fasttext词向量")
            ft_model = GensimFastText(word_tokenized_corpus,
                                      vector_size=args.embedding_size,
                                      window=window_size,
                                      min_count=min_word,
                                      sample=down_sampling,
                                      sg=1,
                                      epochs=40)
            _ensure_parent_dir(args.fastText_path)
            ft_model.save(args.fastText_path)

        def lookup(word):
            return ft_model.wv[word]
    elif vector_tag == 'glove':
        # ``Glove`` and ``Corpus`` come from the file-level ``glove`` import.
        if os.path.exists(args.glove_path):
            gl_model = Glove.load(args.glove_path)
        else:
            print("glove")
            corpus_model = Corpus()
            corpus_model.fit(word_tokenized_corpus, window=5)
            gl_model = Glove(no_components=args.embedding_size, learning_rate=0.05)
            gl_model.fit(corpus_model.matrix, epochs=20, no_threads=1, verbose=True)
            gl_model.add_dictionary(corpus_model.dictionary)
            _ensure_parent_dir(args.glove_path)
            gl_model.save(args.glove_path)

        def lookup(word):
            return gl_model.word_vectors[gl_model.dictionary[word]]
    else:  # 'word2vec'
        if os.path.exists(args.word2vec_path):
            keyed_vectors = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True)
        else:
            print("正在训练word2vec词向量")
            # Same settings as the Word2Vec model whose vectors were returned before.
            wv_model = Word2Vec(word_tokenized_corpus,
                                vector_size=args.embedding_size,
                                min_count=min_word,
                                epochs=50)
            _ensure_parent_dir(args.word2vec_path)
            wv_model.wv.save_word2vec_format(args.word2vec_path, binary=True)
            keyed_vectors = wv_model.wv

        def lookup(word):
            return keyed_vectors[word]

    # Start from uniform-random rows so that words missing from the vector
    # model still get a (trainable) starting value.
    embedding_matrix = np.random.random(
        (len(tokenizer.word_index) + 1, args.embedding_size))
    pas = 0  # number of vocabulary words without a pre-trained vector
    for word, i in tokenizer.word_index.items():
        try:
            embedding_matrix[i] = lookup(word)
        except KeyError:
            # Only "word not in the model" is tolerated; any other error
            # (e.g. a dimension mismatch with a stale model file) propagates.
            pas += 1
    print(embedding_matrix.shape)
    print(pas)
    return embedding_matrix, tokenizer

## 3.3 封装训练集

In [None]:
class PaperData(data.Dataset):
    """Torch dataset of padded token-id sequences (and labels) for one split.

    On first use the split's CSV is cleaned, tokenised, converted to integer
    sequences with ``tokenizer`` and padded; the result is cached as ``.npy``
    files and simply reloaded on later runs.

    Args:
        args: configuration providing ``train_path``, ``test_path``,
            ``process_trainset_path``, ``process_trainlabel_path``,
            ``process_testset_path`` and ``max_len``.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        split: ``"train"`` or ``"test"``.

    Raises:
        Exception: for any other ``split`` value.

    Note:
        The cached ``.npy`` files are not invalidated automatically; delete
        them after changing the tokenizer or ``max_len``.
    """

    def __init__(self, args, tokenizer,split='train'):
        self.texts = []
        self.labels = []
        self.args = args
        self.split = split
        self.tokenizer = tokenizer

        if self.split == "train":
            print("训练集预处理...")
            if not os.path.exists(args.process_trainset_path):
                train = pd.read_csv(self.args.train_path)
                sequence_train = self._encode(train['text'], '训练集预处理：分词...')

                self.texts = sequence_train
                self.labels = list(train['label'])
                np.save(args.process_trainset_path,sequence_train)
                np.save(args.process_trainlabel_path, np.array(self.labels))
            else:
                train_set = np.load(args.process_trainset_path)
                train_label = np.load(args.process_trainlabel_path)
                self.texts = list(train_set)
                self.labels = list(train_label)
        elif self.split == "test":
            print("测试集预处理...")
            if not os.path.exists(args.process_testset_path):
                # The file is written by make_data_loader() as a regular
                # comma-separated CSV, so it is read with the default separator.
                test = pd.read_csv(self.args.test_path)
                sequence_test = self._encode(test['text'], '测试集预处理：分词...')

                self.texts = sequence_test
                # The test split has no labels; -1 is used as a placeholder.
                self.labels = [-1 for i in range(len(test))]
                np.save(args.process_testset_path, sequence_test)
            else:
                test_set = list(np.load(args.process_testset_path))
                self.texts = test_set
                self.labels = [-1 for i in range(len(test_set))]
        else:
            raise Exception("No file for split %s" % self.split)

        assert len(self.texts) == len(self.labels)

    def _encode(self, texts, message):
        """Clean, tokenise, index and pad ``texts``; print ``message`` before indexing.

        Sequences are padded / truncated to ``args.max_len`` tokens (this used
        to pass ``args.embedding_size``, which only worked because both
        settings happened to be 128).
        """
        text_tokenizer = WordPunctTokenizer()
        word_tokenized_corpus = []
        for text in tqdm(texts):
            word_tokenized_corpus.append(text_tokenizer.tokenize(preprocess_text(text)))
        print(message)
        sequences = self.tokenizer.texts_to_sequences(word_tokenized_corpus)
        return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=self.args.max_len)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``{'text': LongTensor, 'label': LongTensor}`` for sample ``index``."""
        _text = self.texts[index]
        _label = self.labels[index]

        sample = {'text': _text, 'label': _label}
        return self.transform(sample)

    def transform(self, sample):
        """Convert a raw sample to int64 tensors placed on ``DEVICE``."""
        text = sample['text']
        label = sample['label']

        text = np.array(text)
        label = np.array(label)
        text = torch.from_numpy(text).to(torch.int64).to(DEVICE)
        label = torch.from_numpy(label).to(torch.int64).to(DEVICE)

        return {'text': text, 'label': label}

In [None]:
# Texts are truncated to their first N words; according to the author's
# experiments, N = 200 gave the best results.
first_n_words = 200
def trim_string(x):
    """Return the first ``first_n_words`` whitespace-separated words of ``x``.

    ``x`` is converted with ``str`` first; the kept words are re-joined with
    single spaces, so runs of whitespace are normalised as a side effect.
    """
    words = str(x).split()
    return ' '.join(words[:first_n_words])
def make_data_loader(args):
    """Prepare the data files, word vectors and datasets for training.

    If ``args.train_path`` does not exist yet, the label mapping and the
    trimmed ``text`` CSV files for the train and test sets are created from
    the raw files. Afterwards the tokenizer / embedding matrix are built and
    both splits are wrapped in ``PaperData``.

    Args:
        args: the notebook configuration (paths, ``batch_size``, ...).

    Returns:
        ``(embedding_matrix, train_set, test_loader, label_id2cate)`` where
        ``train_set`` is a ``PaperData`` dataset (split into folds later),
        ``test_loader`` iterates the test set in file order and
        ``label_id2cate`` maps label ids to category names.
    """
    def _title_plus_abstract(frame):
        """Join title and abstract; a missing value counts as empty text
        (it used to turn the whole text into the literal string "nan")."""
        return (frame['title'].fillna('').astype(str) + ' '
                + frame['abstract'].fillna('').astype(str))

    train = pd.read_csv(args.train_all)
    if not os.path.exists(args.train_path):
        label_id2cate = dict(enumerate(train.categories.unique()))
        label_cate2id = {value: key for key, value in label_id2cate.items()}

        # Make sure the output directories exist before writing into them.
        for out_path in (args.label_path, args.train_path, args.test_path):
            os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

        with open(args.label_path, 'wb') as f:
            pickle.dump(label_id2cate, f, pickle.HIGHEST_PROTOCOL)

        # Training set: trimmed text + integer label.
        train_x = pd.DataFrame({
            'text': _title_plus_abstract(train).apply(trim_string),
            'label': train['categories'].map(label_cate2id),
        })
        train_x.to_csv(args.train_path, index=False)

        # Test set: trimmed text only.
        test = pd.read_csv(args.testall_path, sep='\t')
        test_x = pd.DataFrame({'text': _title_plus_abstract(test).apply(trim_string)})
        test_x.to_csv(args.test_path,index=False)
    else:
        # The CSV files are read later by PaperData; only the label mapping is
        # needed here.
        # NOTE: pickle.load executes arbitrary code from the file — only load
        # label files written by this notebook.
        with open(args.label_path, 'rb') as f:
            label_id2cate = pickle.load(f)

    embedding_matrix_ft, tokenizer = build_word2vec(args, train)
    train_set = PaperData(args, tokenizer=tokenizer,split='train')
    test_set = PaperData(args,tokenizer = tokenizer, split='test')
    test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False)
    return embedding_matrix_ft, train_set, test_loader, label_id2cate
   

# 4 训练模型

In [None]:
# Weight initialisation
def init_network(model, method='kaiming', exclude='embedding', seed=123):  # method='kaiming'
    """Re-initialise the parameters of ``model`` in place.

    Every parameter whose name contains ``'weight'`` is drawn with the chosen
    initialiser (``'xavier'`` -> ``xavier_normal_``, ``'kaiming'`` ->
    ``kaiming_normal_``, anything else -> ``normal_``); every other parameter
    whose name contains ``'bias'`` is set to 0. Parameters whose name contains
    ``exclude`` are left untouched. ``seed`` is accepted but not used.
    """
    initialisers = {
        'xavier': nn.init.xavier_normal_,
        'kaiming': nn.init.kaiming_normal_,
    }
    init_weight = initialisers.get(method, nn.init.normal_)

    for name, w in model.named_parameters():
        if exclude in name:
            continue
        if 'weight' in name:
            init_weight(w)
        elif 'bias' in name:
            nn.init.constant_(w, 0)

# def train(args, model, train_iter,val_iter):
def train(args, model, train_set, test_iter, label_id2cate):
    """Train ``model`` over 10 folds and write the fold-averaged test submission.

    For every fold the training set is split with ``load_data_kfold``, the
    model is trained for ``args.num_epochs`` epochs with Adam, evaluated on
    the fold's validation part after each epoch (saving a checkpoint whenever
    validation accuracy improves) and finally used to predict the test set.
    The test logits of all folds are averaged and passed to ``predict_kfold``.

    Args:
        args: configuration providing ``batch_size``, ``lr``, ``num_epochs``,
            ``save_path`` and ``model`` (the model name).
        model: the network to train (already on the target device).
        train_set: dataset yielding ``{'text', 'label'}`` samples.
        test_iter: iterable of test batches (dicts with a ``'text'`` entry).
        label_id2cate: mapping from label id to category name.

    Notes:
        * NOTE(review): the same ``model`` instance keeps training from fold to
          fold (it is not re-initialised), so in later folds the validation
          samples have already been seen during earlier folds.
        * NOTE(review): the best checkpoint of a fold is saved but never
          reloaded; test predictions use the weights after the last epoch.
    """
    k_fold = 10
    K = 3  # number of PGD attack steps; only used by the commented-out PGD block
    predict_all = None  # running sum of test logits, allocated from the first fold
    os.makedirs(args.save_path, exist_ok=True)
    ckpt_path = os.path.join(args.save_path, args.model + '.ckpt')

    for n in range(k_fold):
        # Split into the train / validation loaders of fold n.
        train_iter, val_iter = load_data_kfold(train_set, args.batch_size, k_fold, n)
        model.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        total_batch = 0
        dev_best_acc = 0
        loss_avg = []      # batch losses of this fold (all epochs)
        train_acc = 0.0    # accuracy of the most recent training batch
        for epoch in range(args.num_epochs):
#             fgm=FGM(model)
            print('Fold=[{}/{}] Epoch [{}/{}]'.format(n+1,k_fold,epoch + 1, args.num_epochs))
#             pgd = PGD(model)
            # ``batch`` (not ``data``) so the torch.utils ``data`` module is not shadowed.
            for batch in train_iter:
                text = batch["text"]
                label = batch["label"]
                outputs = model(text)
                model.zero_grad()
                loss = F.cross_entropy(outputs, label)
                loss_avg.append(loss.item())
                loss.backward()
                # Optional FGM adversarial training:
#                 fgm.attack() # add the adversarial perturbation to the embedding
#                 loss_adv = F.cross_entropy(model(text), label)
#                 loss_adv.backward() # accumulate the adversarial gradients on top of the normal ones
#                 fgm.restore() # restore the embedding weights

                # Optional PGD adversarial training:
#                 pgd.backup_grad()
#                 for t in range(K):
#                     pgd.attack(is_first_attack=t) # perturb the embedding; weights are backed up when t == 0
#                     if t != K-1:
#                         model.zero_grad()
#                     else:
#                         pgd.restore_grad()
#                     loss_adv = F.cross_entropy(model(text), label)
#                     loss_adv.backward() # accumulate the adversarial gradients on top of the normal ones
#                     pgd.restore() # restore the embedding weights

                optimizer.step()
                total_batch += 1
                y_true = label.data.cpu()
                y_pred = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(y_true, y_pred)
            dev_acc, dev_loss = evaluate(model, val_iter)

            if dev_acc > dev_best_acc:
                dev_best_acc = dev_acc
                torch.save(model.state_dict(), ckpt_path)

            msg = 'Iter:{0} train_loss: {1:.3} train_acc: {2:.2%} val_loss: {3:.2} val_acc: {4:.3%}'
            print(msg.format(total_batch, np.mean(loss_avg),
                            train_acc, dev_loss, dev_acc))
            model.train()  # evaluate() leaves the model in eval mode

        # Predict the test set in eval mode: previously the model was still in
        # train mode here, so dropout randomly perturbed the test logits.
        model.eval()
        fold_outputs = []
        with torch.no_grad():
            for batch in test_iter:
                fold_outputs.append(model(batch["text"]).cpu().numpy())
        fold_predict = np.concatenate(fold_outputs, axis=0)
        if predict_all is None:
            # Sized from the data instead of the hard-coded [10000, 39].
            predict_all = np.zeros_like(fold_predict, dtype=np.float64)
        predict_all += fold_predict
    avg_predict = predict_all/k_fold
    predict_kfold(avg_predict, args.model, label_id2cate)

# Validation
def evaluate(model, val_iter):
    """Compute accuracy and mean batch loss of ``model`` on ``val_iter``.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards.

    Args:
        model: the network to evaluate.
        val_iter: sized iterable of batches, each a dict with ``'text'`` and
            ``'label'`` tensors.

    Returns:
        ``(accuracy, mean_loss)`` where ``mean_loss`` is a Python float: the
        sum of the per-batch cross-entropy losses divided by the number of
        batches (0.0 for an empty loader).
    """
    model.eval()
    loss_total = 0.0
    y_preds = []
    y_trues = []
    with torch.no_grad():
        for batch in val_iter:
            text = batch["text"]
            label = batch["label"]
            outputs = model(text)
            # .item() keeps a plain float instead of accumulating device tensors.
            loss_total += F.cross_entropy(outputs, label).item()
            y_trues.extend(label.tolist())
            y_preds.extend(torch.max(outputs.data, 1)[1].cpu().tolist())
    acc = metrics.accuracy_score(y_trues, y_preds)
    # max(..., 1) avoids a ZeroDivisionError when the loader has no batches.
    return acc, loss_total / max(len(val_iter), 1)
# Write the submission file
def predict_kfold(avg_predict, model_name, label_id2cate):
    """Turn averaged class scores into a submission CSV.

    The highest-scoring class of each row of ``avg_predict`` is mapped to its
    category name via ``label_id2cate`` and written into the ``categories``
    column of the sample submission, which is saved as
    ``submit/submit_<model_name>.csv``.
    """
    winning_ids = np.argmax(avg_predict, axis=1)

    submission = pd.read_csv('./data/sample_submit.csv')
    submission['categories'] = list(winning_ids)
    submission['categories'] = submission['categories'].map(label_id2cate)

    out_path = 'submit/submit_{}.csv'.format(model_name)
    submission.to_csv(out_path, index=False)


In [None]:
# Seed every random number generator so that runs are repeatable.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic algorithms

# Load the data: embedding matrix, training dataset, test loader, label mapping.
(embedding_matrix_ft,
 train_set,
 test_loader,
 label_id2cate) = make_data_loader(args)
# Despite its name, ``pretrained_path`` holds the embedding weights as a float tensor.
pretrained_path = torch.FloatTensor(embedding_matrix_ft)
print("加载完成")

In [None]:
# Build the classifier selected by ``args.model``.
# Scores noted by the author: TextCNN 0.8076 (epoch=8), FastText 0.8070 (epoch=20).
# NOTE(review): the word-vector cell runs ``from gensim.models import FastText``,
# which rebinds the name ``FastText`` to gensim's class; after that cell has run,
# the "FastText" option below no longer refers to the classifier defined in
# section 2.2.
if args.model == "TextCNN":
    model = TextCNN(args, pretrained_path)
elif args.model == "FastText":
    model = FastText(args, pretrained_path)
elif args.model == "TextRNN":
    model = TextRNN(args, pretrained_path)
elif args.model == "DPCNN":
    model = DPCNN(args, pretrained_path)
else:
    # Fail early with a clear message instead of a NameError on ``model`` below.
    raise ValueError("Unknown model name: {!r}".format(args.model))

model.to(DEVICE)


# Start training. The test loader returned by make_data_loader() is called
# ``test_loader``; the previously used name ``test_iter`` is not defined at
# notebook level.
train(args, model, train_set, test_loader, label_id2cate)