In [37]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
sns.set()

In [38]:
import torch
from torch.autograd import Variable
from torch import nn, optim
from torch.utils.data import random_split, Dataset, DataLoader

In [45]:
# Pull the two vocabulary lookup tables out of the loaded archive.
# NOTE(review): in the visible cells `datasets` is first assigned further down
# (np.load("final set.npz")), so on a fresh kernel this cell raises NameError
# unless the loading cell is run first — confirm the intended cell order.
word2idx = datasets["word2idx"]
idx2word = datasets["idx2word"]

array({'<START>': 0, '欧': 1, '墙': 2, '骠': 3, '番': 4, '勇': 5, '诸': 6, '妓': 7, '刮': 8, '柱': 9, '寮': 10, '湲': 11, '牒': 12, '曚': 13, '夕': 14, '劬': 15, '琳': 16, '缙': 17, '龙': 18, '峤': 19, '欻': 20, '勍': 21, '览': 22, '蕙': 23, '耸': 24, '柄': 25, '鶬': 26, '蠢': 27, '熬': 28, '示': 29, '丞': 30, '刀': 31, '旌': 32, '每': 33, '悬': 34, '溃': 35, '盛': 36, '浔': 37, '潇': 38, '并': 39, '瑞': 40, '末': 41, '觞': 42, '俱': 43, '徐': 44, '兕': 45, '葬': 46, '雨': 47, '生': 48, '斯': 49, '咏': 50, '担': 51, '距': 52, '劲': 53, '效': 54, '监': 55, '襄': 56, '挐': 57, '禺': 58, '华': 59, '渡': 60, '集': 61, '筐': 62, '齑': 63, '艓': 64, '娄': 65, '裨': 66, '浼': 67, '泻': 68, '蟀': 69, '书': 70, '嵩': 71, '奖': 72, '飏': 73, '哭': 74, '像': 75, '浊': 76, '藕': 77, '琏': 78, '芝': 79, '村': 80, '痊': 81, '汨': 82, '嵘': 83, '祥': 84, '符': 85, '龄': 86, '鼎': 87, '墀': 88, '闪': 89, '铲': 90, '霸': 91, '萼': 92, '估': 93, '茶': 94, '扈': 95, '？': 96, '猊': 97, '狂': 98, '遶': 99, '彩': 100, '畬': 101, '耦': 102, '，': 103, '侪': 104, '着': 105, '韭': 106, '筹': 107, '伫': 108, '姨': 10

In [54]:
# Load the pre-processed corpus. allow_pickle=True is needed because the archive
# stores Python objects (`dataset.item()` below unwraps one of them).
# NOTE(security): unpickling can execute arbitrary code — only load archives you trust.
datasets = np.load("final set.npz", allow_pickle=True)
my_mapping = datasets["use_word2idx"]
dataset = datasets["dataset"]

# `dataset` wraps a single mapping keyed by author; unwrap it once and reuse it.
poems_by_author = dataset.item()
dufu = torch.from_numpy(poems_by_author["dufu"])
sushi = torch.from_numpy(poems_by_author["sushi"])

# Class ids: 0 -> "dufu", 1 -> "sushi". Built directly as int64, the dtype
# nn.CrossEntropyLoss expects for class-index targets.
label_dufu = torch.full((len(dufu), ), fill_value=0, dtype=torch.long)
label_sushi = torch.full((len(sushi), ), fill_value=1, dtype=torch.long)

# Stack both authors in the same order for labels and inputs so rows stay aligned.
final_label = torch.cat((label_dufu, label_sushi), dim=0)
final_dataset = torch.cat((dufu, sushi), dim=0)

In [55]:
# Display the array stored under the "sushi" key of the loaded dataset.
dataset.item()["sushi"]

array([[   0,    0,    0, ..., 2456, 3507, 4682],
       [   0,    0,    0, ..., 1447, 3507, 4682],
       [   0,    0,    0, ..., 3166, 3507, 4682],
       ...,
       [   0,    0,    0, ..., 3124, 3507, 4682],
       [   0,    0,    0, ...,  216, 3507, 4682],
       [   0,    0,    0, ..., 4316, 3507, 4682]])

In [56]:
# Sanity check: the concatenated label tensor should have one entry per sample.
final_label.shape

torch.Size([2300])

In [57]:
# Sanity check: shape of the concatenated dufu + sushi input tensor.
final_dataset.shape

torch.Size([2300, 1210])

In [58]:
# Hold out 10% of the samples for evaluation; random_state is fixed so the
# same split is produced on every run.
split_parts = train_test_split(
    final_dataset,
    final_label,
    test_size=0.1,
    random_state=1,
)
dataset_train, dataset_test, label_train, label_test = split_parts

In [59]:
# Shape of the held-out split produced by train_test_split above.
dataset_test.shape

torch.Size([230, 1210])

In [60]:
# Shape of the training split produced by train_test_split above.
dataset_train.shape

torch.Size([2070, 1210])

### Dataset

In [9]:
class MyDataset(Dataset):
    """Index-aligned pairing of samples and labels for use with a DataLoader.

    Args:
        dataset: Indexable collection of samples.
        label: Indexable collection of labels; its length defines the
            length of the dataset.
    """

    def __init__(self, dataset, label):
        self.datasets, self.labels = dataset, label

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.datasets[idx]
        target = self.labels[idx]
        return sample, target


### Model

In [10]:
class PoemClassifier(nn.Module):
    """LSTM classifier over sequences of token ids.

    Pipeline: embedding -> stacked batch-first LSTM -> linear layer applied to
    the LSTM output of the last time step. ``forward`` returns the raw output
    of the linear layer (no softmax is applied).

    Args:
        words_num: Number of rows in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        classes: Number of output scores per sample.
        num_layers: Number of stacked LSTM layers.
        batch_size: Stored on the instance only; ``forward`` reads the batch
            size from its input instead.
        sequence_length: Stored on the instance only; ``forward`` does not
            use it.
    """

    def __init__(self, words_num, embedding_size, hidden_size, classes, num_layers,
                    batch_size, sequence_length):
        super(PoemClassifier, self).__init__()
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.words_num = words_num
        self.sequence_length = sequence_length
        self.emb = nn.Embedding(words_num, embedding_size)
        self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, classes)
        # Not used by forward(); kept so the attribute stays available to callers.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, hidden=None):
        """Score a batch of token-id sequences.

        Args:
            x: 2-D integer tensor of token ids, (batch_size, sequence_length).
            hidden: Optional ``(h, c)`` initial LSTM state, each of shape
                (num_layers, batch_size, hidden_size). Zeros are used when omitted.

        Returns:
            Tuple ``(out, hidden)``: ``out`` has shape (batch_size, classes) and
            ``hidden`` is the ``(h_n, c_n)`` state returned by the LSTM.
        """
        batch_size, _ = x.shape  # also enforces that x is 2-D
        if hidden is None:
            h, c = self.init_hidden(x, batch_size)
        else:
            # Fix: this was `h. c = hidden` (attribute assignment on an unbound
            # name), which crashed whenever a state was passed in.
            h, c = hidden
        out = self.emb(x)  # (batch_size, sequence_length, embedding_size)
        # batch_first=True -> (batch_size, sequence_length, hidden_size)
        out, hidden = self.LSTM(out, (h, c))
        out = out[:, -1, :]  # keep only the last time step
        out = self.fc1(out)
        return out, hidden

    def init_hidden(self, ipt, batch_size):
        """Return zero-filled float ``(h, c)`` states on the same device as ``ipt``."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h = ipt.new_zeros(state_shape, dtype=torch.float)
        c = ipt.new_zeros(state_shape, dtype=torch.float)
        return (h, c)


In [11]:
# Length of a single training sample (row 1 of dataset_train).
len(dataset_train[1])

1210

### Trainer

In [19]:
# --- Hyper-parameters ---
batch_size = 32
epoch = 40

# --- Model, optimiser, loss ---
# NOTE(review): `my_mapping` is read straight from the .npz archive; if it is
# stored as a pickled dict, len() needs `my_mapping.item()` first — confirm.
# NOTE(review): sequence_length=300 is only stored by the model; forward()
# takes the real length from its input.
model = PoemClassifier(
    words_num=len(my_mapping),
    embedding_size=128,
    hidden_size=128,
    classes=2,
    num_layers=2,
    batch_size=batch_size,
    sequence_length=300,
)
optimizer = optim.Adam(model.parameters(), lr=5e-3, weight_decay=0.003)
criterion = nn.CrossEntropyLoss()
model = model.cuda()

# --- Training data pipeline ---
# drop_last=True discards a trailing partial batch, so every batch has
# exactly `batch_size` samples.
datasets = MyDataset(dataset_train, label_train)
data_loader = DataLoader(
    datasets,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

In [61]:
# Training loop: one optimiser step per mini-batch, with the running batch
# accuracy and the held-out accuracy printed after every step.
model = model.cuda()
dataset_test = dataset_test.cuda()
label_test = label_test.cuda()
for e in range(1, epoch + 1):
    for idx, (data, labels) in enumerate(data_loader):
        data = data.cuda()
        labels = labels.cuda()

        # No state is passed in, so the model starts every batch from its
        # default (zero) LSTM state; the returned state is not needed.
        out, _ = model(data)
        loss = criterion(out, labels)
        # drop_last=True on the loader means every batch has batch_size samples.
        p = (out.argmax(dim=1) == labels).sum().item() / batch_size

        # Held-out accuracy with the pre-update weights. no_grad() stops
        # autograd from recording a graph over the whole held-out set at
        # every step, which only cost memory and time.
        with torch.no_grad():
            val_pred = model(dataset_test)[0].argmax(dim=1)
        acc = (val_pred == label_test).sum().item() / len(dataset_test)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Epoch [{e}/{epoch}] step [{idx + 1}/{len(data_loader)}] loss = {loss.item()} accuracy = {p} val_acc = {round(acc, 4)}")

In [None]:
# Move the full (unsplit) inputs and labels onto the GPU.
final_dataset = final_dataset.cuda()
final_label = final_label.cuda()
# NOTE(review): `sys` is not referenced in any of the visible cells.
import sys

In [35]:
# Spot-check accuracy on one batch-sized slice of the held-out set.
s = 198
batch_x = dataset_test[s:s + batch_size]
batch_y = label_test[s:s + batch_size]
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    batch_pred = model(batch_x)[0].argmax(dim=1)
(batch_pred == batch_y).sum().item() / len(batch_x)

0.9375

In [23]:
# Persist only the model's state_dict (not the module object itself).
torch.save(model.state_dict(), "PoemClassify.pth")

In [30]:
# Number of samples in the held-out split.
len(dataset_test)

230