# 导包

In [1]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import torch.utils.data as Data

!pip install torchtext==0.4
from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator, BucketIterator

from sklearn.model_selection import train_test_split

import csv
import numpy as np
import pandas as pd

from google.colab import drive
import time
from tqdm import tqdm



In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 固定种子

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(2021)

# 定义超参数

In [4]:
LEN_WORLD=50
BATCH_SIZE = 1
EPOCHS = 2
FIX_LENGTH = 75
N_CLASS=17
LR=0.001
OUT_CHANNEL = 50
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 划分验证集

In [5]:
train_df = pd.read_csv('/content/drive/MyDrive/data/track1/tmp/train_total.csv', sep=',')

train, val = train_test_split(train_df, test_size=0.1)
train.to_csv("/content/drive/MyDrive/data/track1/tmp/train.csv", index=False)
val.to_csv("/content/drive/MyDrive/data/track1/tmp/val.csv", index=False)

# 构建Dataset

In [6]:
train_data = pd.read_csv('/content/drive/MyDrive/data/track1/tmp/train.csv')
valid_data = pd.read_csv('/content/drive/MyDrive/data/track1/tmp/val.csv')
test_data = pd.read_csv('/content/drive/MyDrive/data/track1/track1_round1_testA_20210222.csv',sep="\|,\|", header=None, names=['id','text'])

# 对Label的处理
# print([eval(str(train_data['label'][i])) for i in range(len(train_data['label']))])
tokenize_text = lambda x: x.split()
# fix_length指定了每条文本的长度，截断补长
tokenize_label = lambda x: eval(x)
ID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=tokenize_text, lower=True, fix_length=FIX_LENGTH)
LABEL = data.Field(sequential=False, use_vocab=False)

def get_dataset(csv_data, id_field, text_field, label_field, test=False):
	# id数据对训练在训练过程中没用，使用None指定其对应的field
    fields = [("id", id_field), ("text", text_field), ("label", label_field)]       
    examples = []

    if test:
        # 如果为测试集，则不加载label
        for id, text in tqdm(zip(csv_data['id'], csv_data['text'])):
            examples.append(data.Example.fromlist([id, text, None], fields))
    else:
        for text, label in tqdm(zip(csv_data['text'], [eval(str(train_data['label'][i])) for i in range(len(train_data['label']))])):
            examples.append(data.Example.fromlist([None, text, label], fields))
    return examples, fields


# 得到构建Dataset所需的examples和fields
train_examples, train_fields = get_dataset(train_data, None, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(valid_data, None, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_data, ID, TEXT, None, test=True)

# 构建Dataset数据集
train = data.Dataset(train_examples, train_fields)
valid = data.Dataset(valid_examples, valid_fields)
test = data.Dataset(test_examples, test_fields)

  This is separate from the ipykernel package so we can avoid doing imports until
9000it [00:00, 51273.23it/s]
1000it [00:00, 44358.82it/s]
3000it [00:00, 23144.09it/s]


# 定义迭代器

In [7]:
train_iter, val_iter = BucketIterator.splits(
        (train, valid),
        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
        device = DEVICE, # 如果使用gpu，此处将-1更换为GPU的编号
        sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

test_iter = Iterator(test, batch_size=BATCH_SIZE, device=DEVICE, sort=False, train=False, sort_within_batch=False, repeat=False) # train=False可以保证顺序不变

# 加载词向量

In [8]:
vectors = Vectors(name='/content/drive/MyDrive/data/track1/tmp/mymodel.txt')
TEXT.build_vocab(train, vectors=vectors)
weight_matrix = TEXT.vocab.vectors

# 定义模型

In [9]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), LEN_WORLD)     
        self.embedding.weight.data.copy_(weight_matrix)

        self.conv = nn.Sequential(
            # (8, 1, 50, 50)
            nn.Conv2d(1, OUT_CHANNEL, (2, LEN_WORLD)), # input_channel(=1), output_channel, (filter_height, filter_width), stride=1]
            # (8, 25, 49, 1)
            nn.ReLU(),
            nn.MaxPool2d((FIX_LENGTH-1, 1)), # ((filter_height, filter_width))
            # (8, 25, 1, 1)
        )
        self.out = nn.Linear(OUT_CHANNEL, N_CLASS)

    def forward(self, X): # X: [fix_length, batch_size]
        embeds = self.embedding(X)  # [fix_length, batch_size, embedding_size]  
        embeds = self.embedding(X).transpose(0, 1) # [batch_size, sequence_length, embedding_size]
        embeds = embeds.unsqueeze(1) # [batch, channel(=1), sequence_length, embedding_size]
        
        conved = self.conv(embeds)
        flatten = conved.view(BATCH_SIZE, -1)
        y = self.out(flatten)
        return y

model = CNN().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# 训练

In [10]:
start = time.perf_counter()
for epoch in range(EPOCHS):
    for train_batch in train_iter:
        optimizer.zero_grad()
        pred = model(train_batch.text)

        # 因为是向量相乘，不是向量与索引相乘，暂时只能自己写损失函数
        log_prob = torch.nn.functional.log_softmax(pred, dim=1)
        loss = -torch.sum(log_prob * train_batch.label) / BATCH_SIZE
	
        loss.backward()
        optimizer.step()
    if (epoch+1) % 1 == 0:
        end = time.perf_counter()
        print('Epoch: ', (epoch+1), '| train loss: %.6f' % loss, '| time: %.2f s' % (end-start))
        start = time.perf_counter()

Epoch:  1 | train loss: 0.983960 | time: 14.04 s
Epoch:  2 | train loss: -0.000000 | time: 13.91 s


# 评估

In [11]:
def get_acc(y ,p,eps=1e-15):
    loss = 0
    for i in range(len(y)):
        for j in range(len(y[0])):
            y[i][j]=max(eps,min(1-eps,y[i][j]))
            p[i][j]=max(eps,min(1-eps,p[i][j]))
            loss=loss + y[i][j]*np.log(p[i][j]) + (1-y[i][j])*np.log(1-p[i][j])
    acc = 1+loss / (len(y)*len(y[0]))
    return acc

In [12]:
acc=[]
for i, val_batch in enumerate(val_iter):
    p = torch.sigmoid(model(val_batch.text)).cpu().detach().numpy().tolist()
    y = (val_batch.label).cpu().detach().numpy().tolist()
    acc.append(get_acc(y ,p))
print('score: %.4f'% np.mean(acc))

score: 0.4772


# 预测

In [13]:
result=[]
for test_batch in test_iter:
    ids = test_batch.id.cpu().numpy().tolist()
    preds = torch.sigmoid(model(test_batch.text)).cpu().detach().numpy().tolist()
    for i, id in enumerate(ids):
        ob=[]
        pp=[]
        ob.append(str(id)+"|")
        for j in range(len(preds[i])):
            pp.append("{:.2f}".format(preds[i][j]))
        ob.append("|"+" ".join(pp))
        result.append(ob)

# 输出

In [14]:
with open('/content/drive/MyDrive/data/track1/tmp/result.csv','w')as f:
    f_csv = csv.writer(f)
    f_csv.writerows(result)