In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch

In [7]:
# Alternative corpus file, currently disabled:
# df = pd.read_csv('poet.tang.csv')
# Poem corpus; the `poets` column is the one the following cells read.
df = pd.read_csv('60tang.csv')
# NOTE(review): df_dict is not referenced anywhere else in the visible notebook.
df_dict = pd.read_csv('tang.dict.csv')

In [15]:
# Flatten every poem into one long list of characters, after stripping the
# list-literal residue (quotes, ", " separators, brackets) and punctuation.
wordlist = []

for s in df.poets:
    s = s.replace("'", "").replace(", ", "").replace("[", "").replace("]", "")
    s = s.replace("，", "").replace("。", "").replace("《", "").replace("》", "").replace("/", "")
    wordlist.extend(s)

# Number of occurrences of every character in the corpus.
wordcnt = {}
for c in wordlist:
    wordcnt[c] = wordcnt.get(c, 0) + 1

# From here on `wordlist` is the vocabulary: distinct characters, sorted.
wordlist = sorted(set(wordlist))

# Character <-> id lookup tables. Ids are 1-based (0 is never assigned).
# The previous loop never incremented its counter, so every character was
# mapped to id 1; enumerate() gives each vocabulary entry its own id.
word2id = {}
id2word = {}
for i, c in enumerate(wordlist, start=1):
    word2id[c] = i
    id2word[i] = c


In [17]:
# Rebuild both lookup tables from `wordlist`; the entry at position p
# (0-based) receives id p + 1.
word2id = {}
id2word = {}

for token_id, ch in enumerate(wordlist, start=1):
    word2id[ch] = token_id
    id2word[token_id] = ch

In [13]:
def calclen(s):
    """Return how many characters remain in ``s`` after stripping markup.

    The following substrings are removed one after another, in this order:
    ``'``, ``, `` (a comma followed by a space), ``[``, ``]``, ``，``, ``。``,
    ``《``, ``》`` and ``/``.
    """
    stripped = s
    for token in ("'", ", ", "[", "]", "，", "。", "《", "》", "/"):
        stripped = stripped.replace(token, "")
    return len(stripped)

# Add a column holding each poem's cleaned character count. The key 'lenth'
# is kept exactly as spelled, since it is the name written into the frame.
df['lenth'] = df['poets'].map(calclen)

In [19]:
# Number of entries in `wordlist` (the de-duplicated, sorted character list).
len(wordlist)

7131

In [119]:
MAX_LEN = 20  # default number of context ids kept per training sample


class PoetDataset(Dataset):
    """Turn each poem into next-character prediction samples.

    For a poem whose cleaned characters map to ids ``d[0], ..., d[n-1]``,
    one item holds ``n - 1`` (context, target) pairs: the target is
    ``d[i + 1]`` and its context is the last ``max_len`` ids of ``d[:i + 1]``.

    Args:
        df: DataFrame with a ``poets`` column containing the raw poem strings.
        vocab: optional character -> id mapping. When omitted, the
            module-level ``word2id`` is looked up at access time.
        max_len: optional context window length. When omitted, the
            module-level ``MAX_LEN`` is used (the window was previously a
            hard-coded 20 that could drift from ``MAX_LEN``).
    """

    def __init__(self, df, vocab=None, max_len=None):
        self.df = df.copy()
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _clean(raw):
        """Strip list-literal residue and punctuation from a raw poem string."""
        s = raw.replace("'", "").replace(", ", "").replace("[", "").replace("]", "")
        return s.replace("，", "").replace("。", "").replace("《", "").replace("》", "").replace("/", "")

    def __getitem__(self, idx):
        """Return ``{'text': list of id lists, 'label': LongTensor of targets}``.

        Raises:
            KeyError: if the poem contains a character missing from the vocabulary.
        """
        vocab = word2id if self.vocab is None else self.vocab
        max_len = MAX_LEN if self.max_len is None else self.max_len

        raw = self.df.iloc[idx]['poets']
        d = [vocab[w] for w in self._clean(raw)]
        if len(d) == 0:
            # Best-effort diagnostic: report poems that are empty after cleaning.
            print(idx, raw)

        # Targets are every id but the first. An explicit dtype keeps the
        # label integer-typed even when the poem has fewer than two characters.
        t = torch.tensor(d[1:], dtype=torch.long)

        # One context window per target, capped at the last `max_len` ids.
        text = [d[max(0, i + 1 - max_len):i + 1] for i in range(len(t))]

        return {'text': text, 'label': t}
    
# One poem per batch, visited in random order, loaded in the main process.
poet_dataset = PoetDataset(df)
data_loader = DataLoader(
    dataset=poet_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

In [122]:
class PoetryModel(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier over the next character.

    Args:
        vocab: number of rows in the embedding table; must be larger than the
            biggest id that will be fed in.
        hidden_size: width of the embeddings and of the LSTM hidden state.
        n_cat: number of output classes.
        bs: stored for backward compatibility only; the batch size is taken
            from the input of ``forward``.
        nl: number of stacked LSTM layers.
        max_len: length every context is left-padded (or truncated) to. When
            omitted, the module-level ``MAX_LEN`` is read at call time.
    """

    def __init__(self, vocab, hidden_size, n_cat, bs=1, nl=2, max_len=None):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        self.max_len = max_len
        # Size the table from the constructor argument instead of a global.
        self.e = nn.Embedding(vocab, hidden_size)
        # forward() builds a (batch, seq, feature) tensor, so the LSTM must be
        # told that the batch dimension comes first.
        self.lstm = nn.LSTM(hidden_size, hidden_size, nl, batch_first=True)
        self.fc2 = nn.Linear(hidden_size, n_cat)
        self.softmax = nn.LogSoftmax(dim=-1)

    @staticmethod
    def _as_id_tensor(seq):
        """Convert one context (list of ints, array, tensor, or list of tensors) to a 1-D LongTensor."""
        if torch.is_tensor(seq):
            ids = seq
        elif len(seq) > 0 and torch.is_tensor(seq[0]):
            # A DataLoader's default collate turns every int of a nested list
            # into its own small tensor; glue them back into one sequence.
            ids = torch.cat([t.reshape(-1) for t in seq])
        else:
            ids = torch.as_tensor(seq)
        return ids.long().reshape(-1)

    def forward(self, inp):
        """Return log-probabilities of shape ``(len(inp), n_cat)``.

        Args:
            inp: sequence of contexts; each context is a sequence of token ids.

        Raises:
            ValueError: if ``inp`` is empty.
        """
        if len(inp) == 0:
            raise ValueError("PoetryModel.forward received an empty batch")

        max_len = MAX_LEN if self.max_len is None else self.max_len
        # Follow whatever device the parameters live on instead of forcing CUDA.
        device = self.e.weight.device

        rows = []
        for seq in inp:
            ids = self._as_id_tensor(seq)[-max_len:].to(device)
            emb = self.e(ids)
            # Left-pad with zero vectors so every row is (max_len, hidden_size).
            rows.append(F.pad(emb, (0, 0, max_len - emb.size(0), 0)))
        e_out = torch.stack(rows)  # (batch, max_len, hidden_size)

        # Initial hidden/cell states default to zeros.
        rnn_o, _ = self.lstm(e_out)
        rnn_o = rnn_o[:, -1, :]  # output at the last time step of every sample
        # Dropout must follow the module's train/eval mode; without the flag it
        # stays active during evaluation as well.
        fc = F.dropout(self.fc2(rnn_o), p=0.8, training=self.training)
        out = self.softmax(fc)
        return out

# Token ids run from 1 to len(wordlist) (id 0 is never assigned), so both the
# embedding table and the output layer need len(wordlist) + 1 slots; with
# only len(wordlist) slots the largest id would be out of range.
n_vocab = len(wordlist) + 1
n_hidden = 256
n_cat = n_vocab
bs = 128  # NOTE(review): not passed to the model, which is built with bs=32 below

# Use the GPU when one is present, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = PoetryModel(n_vocab, n_hidden, n_cat, bs=32)
model = model.to(device)

In [114]:
# Loss function and optimiser that fit() reads from module scope.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

def fit(epoch, model, data_loader, phase='training', volatile=False, criterion=None, opt=None):
    """Run one pass over ``data_loader`` and report per-target loss and accuracy.

    Args:
        epoch: epoch number, used only in the printed summary.
        model: module called as ``model(batch['text'])``.
        data_loader: iterable of dicts with ``'text'`` and ``'label'`` entries.
        phase: ``'training'`` updates the weights; ``'validation'`` switches the
            model to eval mode and disables gradient tracking.
        volatile: accepted for backward compatibility and ignored.
        criterion: loss callable; defaults to the module-level ``loss_fn``.
        opt: optimiser used in the training phase; defaults to the
            module-level ``optimizer``.

    Returns:
        ``(loss, accuracy)`` as Python floats. Both are averaged over the
        number of predicted targets, so accuracy is a true percentage
        (dividing the correct-character count by the number of poems could
        exceed 100).
    """
    training = phase == 'training'
    validating = phase == 'validation'
    if training:
        model.train()
    if validating:
        model.eval()

    # Fall back to the notebook-level objects only when none were supplied.
    criterion = loss_fn if criterion is None else criterion
    if training and opt is None:
        opt = optimizer

    # Put targets on the same device as the model instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    running_correct = 0
    n_targets = 0

    for batch in data_loader:
        text, target = batch['text'], batch['label']
        target = target.long().to(device).view(-1)
        if target.numel() == 0:
            # Nothing to predict for this batch (e.g. a poem with fewer than two characters).
            continue

        with torch.set_grad_enabled(not validating):
            if training:
                opt.zero_grad()
            output = model(text)
            loss = criterion(output, target)
            if training:
                loss.backward()
                opt.step()

        count = target.numel()
        # Weight by the target count so the final figure is a per-target mean
        # (assumes the criterion returns a mean over its targets, as the default does).
        running_loss += loss.item() * count
        preds = output.detach().argmax(dim=1)
        running_correct += (preds == target).sum().item()
        n_targets += count

    denom = max(n_targets, 1)  # avoid dividing by zero when every batch was skipped
    loss = running_loss / denom
    accuracy = 100. * running_correct / denom

    print(f'{epoch} {phase} loss: {loss:.6f}, accuracy: {accuracy:.4f} ... {running_correct}/{n_targets} - ')

    return loss, accuracy

In [None]:
train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []  # remain empty while the validation pass is disabled
patient = 0                 # consecutive epochs without a new best training loss
best_loss = float('inf')    # any real loss value improves on this
for epoch in range(1, 500):

    epoch_loss, epoch_accuracy = fit(epoch, model, data_loader, phase='training')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)

    # The validation pass is disabled, so its results must not be appended
    # either: the names below do not exist and would raise NameError.
    # val_epoch_loss , val_epoch_accuracy = fit(epoch, model, data_loader, phase='validation')
    # val_losses.append(val_epoch_loss)
    # val_accuracy.append(val_epoch_accuracy)

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        print('best_loss: ', best_loss)
        patient = 0
    else:
        # Early stopping: give up once more than 5 epochs in a row fail to improve.
        patient += 1
        if patient > 5:
            break
        

In [216]:
# Write the model's state_dict (not the module object itself) to ./model.pt.
torch.save(model.state_dict(), './model.pt')

In [53]:
# Scratch cell: look up two rows of a freshly initialised embedding table.
# NOTE(review): word_to_ix is defined but not used by the lines below.
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(1000, 128)  # table with 1000 rows, each a 128-dimensional embedding
lookup_tensor = torch.LongTensor([3,4])  # row indices to fetch
hello_embed = embeds(lookup_tensor)
print(hello_embed, hello_embed.size())

tensor([[-0.4402, -0.7088,  0.0667, -1.7056, -0.1849,  0.4807,  2.5024, -1.6402,
          0.1469, -0.1713, -1.4745,  1.5302, -1.0410,  0.0351,  2.9185, -1.7908,
          0.5428, -0.8813, -0.2174,  0.3891,  0.9730,  0.8021, -0.1282,  1.7987,
         -1.3707, -0.1314,  0.2632, -0.5971,  1.3801, -0.0646,  1.6698, -0.2297,
         -0.7425, -0.8525,  1.1486, -1.0557, -1.0240, -0.6615,  0.0601, -0.5494,
         -0.4581, -1.5214, -0.9249,  0.3678, -0.2644,  1.4730,  1.6285,  0.4550,
         -0.3198,  0.6505, -0.7824,  1.2003, -0.0090, -0.6278, -0.0644, -0.5774,
          1.3314, -0.9830,  0.1102, -0.9747, -0.2678, -0.2209, -0.4732, -0.2898,
          0.1829,  0.1117,  0.9074, -1.1729, -0.7261,  0.5295, -0.9335, -0.7991,
         -0.6180, -0.8068,  0.5029,  0.1366, -0.7488, -0.6100,  0.8735, -0.6095,
          0.3592,  1.3576,  0.4937, -1.5541, -1.0082, -0.4329,  0.9803,  0.1081,
          0.2848,  0.4500, -0.7060, -1.2691, -0.9035, -1.9650,  0.3038,  0.6682,
          1.6333,  0.3349, -

In [56]:
# Shape of the index tensor (it was built from two indices).
lookup_tensor.size()

torch.Size([2])