# 导包

In [1]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import torch.utils.data as Data

!pip install torchtext==0.4
from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator, BucketIterator

from sklearn.model_selection import train_test_split

import csv
import numpy as np
import pandas as pd

from tqdm import tqdm
import zipfile
import os
import shutil

Collecting torchtext==0.4
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 129 kB/s eta 0:00:01     |████████████▍                   | 20 kB 2.7 MB/s eta 0:00:01
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.8.0a0+cd6902d
    Uninstalling torchtext-0.8.0a0+cd6902d:
      Successfully uninstalled torchtext-0.8.0a0+cd6902d
Successfully installed torchtext-0.4.0


# 固定种子

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import so the function works even where ``random`` has not been
    # imported at the top of the notebook.
    import random

    # NOTE(review): torchtext 0.4 iterators appear to shuffle through Python's
    # ``random`` state (torchtext.data.utils.RandomShuffler), so batch order is
    # only repeatable when this generator is seeded as well -- confirm.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(2021)

# 定义超参数

In [3]:
# --- Data / batching ---
BATCH_SIZE = 8      # examples per mini-batch for every iterator
FIX_LENGTH = 40     # each text is padded or truncated to this many tokens

# --- Model ---
OUT_CHANNEL = 20    # number of convolution filters
N_CLASS = 2         # size of the final linear layer's output

# --- Optimisation ---
EPOCHS = 2          # full passes over the training iterator
LR = 1e-3           # Adam learning rate

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# 划分训练集验证集

In [4]:
# Read the full labelled training set shipped with the competition.
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')

# Hold out 20% of the rows for validation.
# - random_state pins the split so the CSVs written below are identical on
#   every run, independent of how much global NumPy randomness was consumed.
# - stratify keeps the proportion of each `target` value equal in both splits.
train, val = train_test_split(
    train_df,
    test_size=0.2,
    random_state=2021,
    stratify=train_df['target'],
)

# Persist both splits; the dataset-building cell reads them back from disk.
train.to_csv("./train.csv", index=False)
val.to_csv("./val.csv", index=False)

# 构建Dataset

In [5]:
# Load the raw tables: the two local splits plus the competition test set.
test_data = pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
valid_data = pd.read_csv('./val.csv')
train_data = pd.read_csv('./train.csv')


def tokenize_text(x):
    """Split a string on whitespace and return the resulting list of tokens."""
    return x.split()


# fix_length sets the length of every text: shorter ones are padded,
# longer ones are truncated.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenize_text,
    lower=True,
    fix_length=FIX_LENGTH,
)
# Labels are single values that are used as-is, without a vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False)

def get_dataset(csv_data, id_field, text_field, label_field, test=False):
    """Turn a table of questions into torchtext examples plus their field spec.

    Args:
        csv_data: Table with a ``question_text`` column and, unless ``test``
            is true, a ``target`` column.
        id_field: Not used by this function; kept so existing calls keep
            working.
        text_field: Field applied to every question text.
        label_field: Field applied to every label (may be ``None``).
        test: When true, labels are not read and ``None`` is stored instead.

    Returns:
        A tuple ``(examples, fields)``: the list of ``data.Example`` objects
        and the ``[(name, field), ...]`` list they were built with.
    """
    # The id column is useless during training, so it is mapped to None.
    fields = [("id", None), ("text", text_field), ("label", label_field)]
    # zip() has no length, so tell tqdm how many rows to expect; otherwise it
    # can only print a running counter without a bar or an ETA.
    n_rows = len(csv_data)

    if test:
        # Test set: do not load any label.
        examples = [
            data.Example.fromlist([None, text, None], fields)
            for text in tqdm(csv_data['question_text'], total=n_rows)
        ]
    else:
        rows = zip(csv_data['question_text'], csv_data['target'])
        examples = [
            data.Example.fromlist([None, text, label], fields)
            for text, label in tqdm(rows, total=n_rows)
        ]
    return examples, fields


# Build the examples and field specs that each Dataset needs.
train_examples, train_fields = get_dataset(
    csv_data=train_data, id_field=None, text_field=TEXT, label_field=LABEL
)
valid_examples, valid_fields = get_dataset(
    csv_data=valid_data, id_field=None, text_field=TEXT, label_field=LABEL
)
# The test set has no labels, so no label field is passed.
test_examples, test_fields = get_dataset(
    csv_data=test_data, id_field=None, text_field=TEXT, label_field=None, test=True
)

# Wrap each (examples, fields) pair in a torchtext Dataset.
train = data.Dataset(examples=train_examples, fields=train_fields)
valid = data.Dataset(examples=valid_examples, fields=valid_fields)
test = data.Dataset(examples=test_examples, fields=test_fields)

1044897it [00:18, 58004.90it/s]
261225it [00:04, 56780.91it/s]
100%|██████████| 375806/375806 [00:05<00:00, 65624.24it/s]


# 构建迭代器

In [6]:
def _text_length(example):
    """Sort key telling the BucketIterator how to group examples: token count."""
    return len(example.text)


# Options shared by all three iterators. Batches are placed on DEVICE, and
# repeat=False makes each iterator stop after one pass over its dataset.
shared_options = dict(device=DEVICE, sort_within_batch=False, repeat=False)

train_iter, val_iter = BucketIterator.splits(
    (train, valid),
    batch_sizes=(BATCH_SIZE,) * 2,
    sort_key=_text_length,
    **shared_options,
)

# train=False together with sort=False keeps the test examples in their
# original order, so predictions can be matched back to rows by position.
test_iter = Iterator(
    test,
    batch_size=BATCH_SIZE,
    sort=False,
    train=False,
    **shared_options,
)

# 加载词向量

In [7]:
def un_zip(file_name):
    """Extract a zip archive into ``./<stem>`` under the current directory.

    ``<stem>`` is the archive's file name (the part after the last ``/``)
    up to its first dot, e.g. ``a/b/embeddings.zip`` -> ``./embeddings``.
    The directory is created when missing and reused when present.

    Args:
        file_name: Path of the zip archive, using ``/`` as separator.

    Returns:
        The destination directory the archive was extracted into.
    """
    dest_dir = './' + file_name.split('/')[-1].split('.')[0]
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file_name) as zip_file:
        os.makedirs(dest_dir, exist_ok=True)
        zip_file.extractall(dest_dir)
    return dest_dir

In [8]:
# Unpack the bundled embeddings into ./embeddings.
un_zip('../input/quora-insincere-questions-classification/embeddings.zip')

# Only the GloVe vectors are loaded below, so delete the other extracted
# embedding folders. The isdir guard keeps the cell from aborting when a
# folder is already gone.
for unused_dir in (
    './embeddings/wiki-news-300d-1M',
    './embeddings/GoogleNews-vectors-negative300',
    './embeddings/paragram_300_sl999',
):
    if os.path.isdir(unused_dir):
        shutil.rmtree(unused_dir)

KeyboardInterrupt: 

In [None]:
# Pretrained GloVe vectors extracted by the previous cell.
glove_path = './embeddings/glove.840B.300d/glove.840B.300d.txt'
vectors = Vectors(name=glove_path)

# Build the vocabulary from the training set and attach a vector to each token.
TEXT.build_vocab(train, vectors=vectors)

# One row per vocabulary entry; copied into the model's embedding layer later.
weight_matrix = TEXT.vocab.vectors

# 定义模型

In [None]:
class CNN(nn.Module):
    """Text CNN with one bank of height-2 filters over pretrained embeddings.

    Pipeline: embedding lookup -> Conv2d spanning two consecutive tokens and
    the full embedding width -> ReLU -> max-pool over all positions -> linear
    layer producing one logit per class.

    Every constructor argument is optional. When left as ``None`` it falls
    back to the notebook-level value (``TEXT.vocab``, ``weight_matrix``,
    ``OUT_CHANNEL``, ``FIX_LENGTH``, ``N_CLASS``), so ``CNN()`` builds the
    same network as before.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        pretrained_weights: Tensor of shape ``[vocab_size, embedding_dim]``
            copied into the embedding table.
        out_channel: Number of convolution filters.
        fix_length: Length every input sequence is padded/truncated to.
        n_class: Number of output classes.
    """

    def __init__(self, vocab_size=None, embedding_dim=300, pretrained_weights=None,
                 out_channel=None, fix_length=None, n_class=None):
        super(CNN, self).__init__()
        # Resolve defaults lazily so the notebook globals are only needed
        # when the caller does not supply a value.
        if vocab_size is None:
            vocab_size = len(TEXT.vocab)
        if pretrained_weights is None:
            pretrained_weights = weight_matrix
        if out_channel is None:
            out_channel = OUT_CHANNEL
        if fix_length is None:
            fix_length = FIX_LENGTH
        if n_class is None:
            n_class = N_CLASS

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Initialise the table from the pretrained vectors.
        self.embedding.weight.data.copy_(pretrained_weights)

        self.conv = nn.Sequential(
            # input_channel(=1), output_channel, (filter_height, filter_width), stride=1
            nn.Conv2d(1, out_channel, (2, embedding_dim)),
            nn.ReLU(),
            # A height-2 filter yields fix_length-1 positions; pool over all of them.
            nn.MaxPool2d((fix_length - 1, 1)),
        )
        self.out = nn.Linear(out_channel, n_class)

    def forward(self, X):  # X: [fix_length, batch_size]
        """Return class logits of shape ``[batch_size, n_class]``."""
        # Look the embeddings up once, then put the batch dimension first.
        embeds = self.embedding(X).transpose(0, 1)  # [batch_size, sequence_length, embedding_size]
        embeds = embeds.unsqueeze(1)  # [batch, channel(=1), sequence_length, embedding_size]

        conved = self.conv(embeds)
        # Collapse everything except the batch dimension.
        flatten = conved.view(conved.size(0), -1)
        y = self.out(flatten)
        return y

# Build the network and move its parameters to the selected device.
model = CNN()
model = model.to(DEVICE)

# Cross-entropy loss over the logits produced by the model.
loss_fc = nn.CrossEntropyLoss()
loss_fc = loss_fc.to(DEVICE)

# Adam updates every model parameter with learning rate LR.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# 训练

In [None]:
# Set training mode explicitly, in case the model was switched to eval mode
# elsewhere before this cell is (re-)run.
model.train()
for epoch in range(EPOCHS):
    # Accumulate the loss as a detached tensor so it is only converted to a
    # Python float once per epoch.
    running_loss = 0.0
    n_batches = 0
    for batch in tqdm(train_iter, desc=f"epoch {epoch + 1}/{EPOCHS}"):
        optimizer.zero_grad()
        pred = model(batch.text)

        loss = loss_fc(pred, batch.label)
        loss.backward()
        optimizer.step()

        running_loss = running_loss + loss.detach()
        n_batches += 1

    # Report the mean training loss so progress is visible per epoch.
    mean_loss = float(running_loss) / max(n_batches, 1)
    print(f"epoch {epoch + 1}/{EPOCHS} - mean train loss: {mean_loss:.4f}")

# 评估

In [None]:
# Count correct predictions over the whole validation iterator.
# (Named n_correct / n_total rather than true / all so the builtin `all`
# is not shadowed for the rest of the session.)
n_correct = 0
n_total = 0

# Evaluate in eval mode and without autograd bookkeeping, then restore the
# mode the model was in so later cells are unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    for val_batch in tqdm(val_iter):
        # Index of the largest logit is the predicted class.
        pred_y = torch.max(model(val_batch.text), 1)[1]
        real_y = val_batch.label
        n_correct += (pred_y == real_y).sum().item()
        n_total += real_y.size(0)
model.train(was_training)

accuracy = n_correct / n_total

print(accuracy)

# 预测

In [None]:
# Predict a class for every test example, in iterator order.
predictions = []

# Inference needs neither gradients nor training-mode behaviour; restore the
# previous mode afterwards so later cells are unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    for test_batch in test_iter:
        batch_labels = torch.max(model(test_batch.text), 1)[1]
        predictions.extend(batch_labels.cpu().tolist())
model.train(was_training)

# test_iter yields examples in the same order as the rows of test_data, so
# predictions are paired with qids by position; fail loudly if the counts differ.
assert len(predictions) == len(test_data), "prediction count does not match test rows"
result = [[qid, label] for qid, label in zip(test_data['qid'], predictions)]

In [None]:
# Write the predictions as a two-column CSV.
header = ['qid', 'prediction']
# Open with "w" so re-running the cell replaces the file; appending would
# stack a second header and a second copy of every row onto the old content.
with open("./result.csv", "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(result)