In [None]:
import pandas as pd
import torch
import numpy as np

# Number of examples per training mini-batch.
BATCHSIZE = 10

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
if USE_CUDA:
    # Selecting a CUDA device raises on a machine without CUDA, so only do it
    # when a GPU is actually available.
    torch.cuda.set_device(0)

# NOTE(review): absolute, machine-specific path, while the later cells read and
# write the relative 'data/' directory -- confirm both refer to the same place.
data_root = '/home/henry/nlp-beginner/data/'
data_all = pd.read_csv(data_root + 'train.tsv', sep='\t')
pd.set_option('display.width', 900)

# Row positions of the full dataset; shuffled in the next cell to build splits.
idx = np.arange(data_all.shape[0])

In [None]:
# Rebuild the index array inside this cell so that re-running it always starts
# from the unshuffled order and therefore reproduces exactly the same split
# (shuffling the leftover, already-shuffled array would give a different one).
idx = np.arange(data_all.shape[0])
np.random.seed(1)
np.random.shuffle(idx)

# Split boundaries: first 60% -> train, next 20% -> test, last 20% -> dev.
train_end = int(len(idx) * 0.6)
test_end = int(len(idx) * 0.8)

# Each file is written without the index column; pandas writes a header row
# by default, so the first line of every CSV holds the column names.
data_all.iloc[idx[:train_end], :].to_csv('data/task2_train.csv', index=False)
data_all.iloc[idx[train_end:test_end], :].to_csv('data/task2_test.csv', index=False)
data_all.iloc[idx[test_end:], :].to_csv('data/task2_dev.csv', index=False)

In [None]:
from torchtext import data

PAD_TOKEN = '<pad>'
# sequential=True means the field is tokenized; sequential=False means it is
# used as-is (no tokenization).
TEXT = data.Field(sequential=True, batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)

# One entry per CSV column, in file order. (name, None) drops the column,
# (name, field) processes it with the given Field.
datafields = [("PhraseId", None),
              ("SentenceId", None),
              ("Phrase", TEXT),
              ("Sentiment", LABEL)]

# datafields defines how each column is processed and is passed to
# TabularDataset.splits(); it must be a list of (name, None) / (name, field)
# tuples.
# The split CSVs start with a header row of column names, so skip_header=True
# is required: otherwise that row is parsed as an example, which adds a bogus
# "Sentiment" class to the label vocabulary and a junk phrase to every split.
train_data, dev_data, test_data = data.TabularDataset.splits(
    path='data/',
    train='task2_train.csv',
    validation='task2_dev.csv',
    test='task2_test.csv',
    format='csv',
    skip_header=True,
    fields=datafields
)

# Vocabularies are built from the training split only. Words without a GloVe
# vector are initialised uniformly in [-0.25, 0.25].
TEXT.build_vocab(train_data, vectors='glove.6B.50d', unk_init = lambda x:torch.nn.init.uniform_(x,a=-0.25,b=0.25))
LABEL.build_vocab(train_data)

# Set PAD_TOKEN row to be all zero
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
print(PAD_INDEX)
TEXT.vocab.vectors[PAD_INDEX] = 0.0


In [None]:
# Training batches: shuffled mini-batches of BATCHSIZE examples, placed on DEVICE.
train_iterator = data.BucketIterator(
    train_data,
    batch_size=BATCHSIZE,
    train=True,
    shuffle=True,
    device=DEVICE,
)

# Evaluation iterators: batch_size is set to the size of the split, so each
# split is requested as one unsorted batch.
dev_iterator = data.Iterator(
    dev_data,
    batch_size=len(dev_data),
    train=False,
    sort=False,
    device=DEVICE,
)
test_iterator = data.Iterator(
    test_data,
    batch_size=len(test_data),
    train=False,
    sort=False,
    device=DEVICE,
)

In [None]:
import torch.nn.functional as F
import torch.nn as nn

class CNN(nn.Module):
    """Sentence classifier: three parallel convolutions over word embeddings,
    max-over-time pooling, dropout and a linear output layer.

    Args:
        embedding_choice: 'rand' for a trainable, randomly initialised
            embedding table, or 'glove' for a frozen table built from
            pretrained vectors.
        num_embeddings: Number of rows of the embedding table (used by 'rand').
        embedding_dim: Width of one embedding vector. The convolution kernels
            span this full width, so for 'glove' it must equal the second
            dimension of the pretrained vectors.
        num_filter: Number of output channels of each convolution.
        label_num: Number of output classes.
        p: Dropout probability applied to the concatenated pooled features.
        pretrained_vectors: Optional 2-D float tensor used for 'glove'. When
            omitted, the module-level ``TEXT.vocab.vectors`` is used.

    Raises:
        ValueError: If ``embedding_choice`` is neither 'rand' nor 'glove'.
    """

    def __init__(self, embedding_choice, num_embeddings, embedding_dim, num_filter, label_num, p=0.5,
                 pretrained_vectors=None):
        super(CNN, self).__init__()
        self.embedding_choice = embedding_choice

        if self.embedding_choice == 'rand':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        elif self.embedding_choice == 'glove':
            if pretrained_vectors is None:
                # Backward-compatible default: vectors of the notebook's TEXT field.
                pretrained_vectors = TEXT.vocab.vectors
            # from_pretrained is a classmethod; call it on the class instead of
            # allocating a throwaway embedding table first.
            self.embedding = nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)
        else:
            # Fail at construction time rather than with an AttributeError in forward().
            raise ValueError("embedding_choice must be 'rand' or 'glove', got %r" % (embedding_choice,))

        # Kernels of height 3/4/5 tokens covering the full embedding width.
        # Padding of (height - 1) rows on both ends keeps the output non-empty
        # even for inputs shorter than the kernel.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filter, kernel_size=(3, embedding_dim), padding=(2, 0))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=num_filter, kernel_size=(4, embedding_dim), padding=(3, 0))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=num_filter, kernel_size=(5, embedding_dim), padding=(4, 0))

        self.linear = nn.Linear(in_features=3 * num_filter, out_features=label_num)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        """Map token indices of shape (Batch_size, Length) to class scores of
        shape (Batch_size, label_num)."""
        x = self.embedding(x).unsqueeze(1)   # (Batch_size, 1, Length, Emb_dim)

        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            feat = F.relu(conv(x).squeeze(-1))  # (Batch_size, filter_num, Length+pad_len)
            # Squeeze only the pooled length axis. A bare squeeze() would also
            # drop the batch axis for a batch of one (or the filter axis when
            # filter_num == 1) and break the concatenation below.
            pooled.append(F.max_pool1d(feat, feat.size(2)).squeeze(2))  # (Batch_size, filter_num)

        x = self.dropout(torch.cat(pooled, dim=1))
        out = self.linear(x)
        return out

In [None]:
# Build the classifier (GloVe embeddings, 128 filters per kernel size, dropout
# 0.1) and move it to the target device *before* creating the optimizer, as the
# torch.optim documentation advises. DEVICE is 'cuda' exactly when USE_CUDA is
# true, so this covers the former `if USE_CUDA: model.cuda()` step.
model = CNN('glove', len(TEXT.vocab), TEXT.vocab.vectors.shape[1], 128, len(LABEL.vocab), 0.1).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
import os
import time

epoch = 100
best_accuracy = 0.0
start_time = time.time()

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save.
os.makedirs('model_dict/model_glove2', exist_ok=True)

for i in range(epoch):
    # ---- training pass over the whole training set ----
    model.train()
    total_loss = 0.0
    steps = 0

    for batch in train_iterator:
        steps += 1
        optimizer.zero_grad()

        batch_text = batch.Phrase
        batch_label = batch.Sentiment
        out = model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        loss.backward()
        optimizer.step()

        # Progress report every 100 steps: share of the epoch processed so far
        # and the running mean of the batch losses.
        if steps % 100 == 0:
            print("Epoch %d_%.3f%%: Training avarage Loss: %f"
                  % (i, steps*train_iterator.batch_size*100/len(train_iterator.dataset), total_loss/steps))

    # ---- validation pass after each epoch ----
    model.eval()
    total_loss = 0.0
    total_correct = 0.0
    total_data_num = len(dev_iterator.dataset)
    steps = 0
    # No gradients are needed for evaluation; disabling autograd avoids keeping
    # the graph of the (potentially very large) dev batch in memory.
    with torch.no_grad():
        for batch in dev_iterator:
            steps += 1
            batch_text = batch.Phrase
            batch_label = batch.Sentiment
            out = model(batch_text)
            loss = criterion(out, batch_label)
            total_loss = total_loss + loss.item()

            correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
            total_correct = total_correct + correct.item()

    if steps == 0:
        # Nothing was evaluated (empty dev set): nothing to report or save.
        continue

    # Report and checkpoint once per epoch, after all dev batches have been
    # accumulated, so the accuracy always covers the full dev set.
    dev_accuracy = total_correct/total_data_num
    print("Epoch %d: Verification avarage Loss: %f, Verification accuracy: %f%%, Total Time:%f"
          % (i, total_loss/steps, 100*total_correct/total_data_num, time.time()-start_time))

    # Keep a checkpoint whenever the dev accuracy improves on the best so far.
    if best_accuracy < dev_accuracy:
        best_accuracy = dev_accuracy
        torch.save(model, 'model_dict/model_glove2/epoch_%d_accuracy_%f' % (i, dev_accuracy))
        print('Model is saved in model_dict/model_glove2/epoch_%d_accuracy_%f' % (i, dev_accuracy))