In [335]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

In [336]:
# read data
data = pd.read_csv('PoetryFoundationData.csv')

# count number of non empty cells in column tags
data['Tags'].count()
# drop rows with empty cells in column tags
data = data.dropna(subset=['Tags'])

# make new df with poem and tags
data = data[['Poem', 'Tags']]

In [337]:
# find number of unique words in the vocabulary in train_data.csv
words = []
for i in range(len(data)):
    words.extend(data['Poem'].iloc[i].split())
words = list(set(words))
vocab_size = len(words)
vocab_size
words


['river?,',
 'blether,',
 'sojourned',
 'gracefully',
 'patient—his',
 'dryad.',
 'Archimedes,',
 'Heart”',
 'Lawyers,',
 'Tras',
 'cat’s.',
 'background.',
 'phosphorescence.',
 'finger-pointing',
 '(whose?)',
 'gibbetOf',
 'it.Once',
 'stormthat',
 'tribe:',
 'sonnet',
 'suede',
 'sun-porch',
 'melancholique',
 'money,though',
 '“fiddle-de-Dutch”?',
 'Cunliffe',
 '(Keeping',
 'militantly',
 'Lie,his',
 'bettered:',
 'if—(lest',
 'whiftswhifts',
 'compelled,',
 'lunatuna',
 'embankments.',
 'Toby',
 'brassy.”',
 '‘Boy,',
 'scintillating,',
 'stories—',
 'bubbles.',
 'prideburning',
 'areft',
 'bold-faced',
 'Juliet',
 'inkwell.',
 'Winchelsea,',
 'BowersGrace',
 'blowzy-haired',
 'careering,',
 'sinners,',
 'thine,”',
 'Fell,',
 'Godwards',
 'contents:small',
 'perished.Had',
 'know.The',
 'Adagios',
 "fear.There's",
 'sires,',
 'innards.Fling',
 'guns—O',
 'What’ll',
 'millihelen',
 'Vaya',
 'gilded,',
 'fronds.II',
 'all-powerful',
 'punishedwith',
 'dye.So',
 'stayed,Developing,',


In [338]:

class LossFun(nn.Module):
    def __init__(self,):
        super().__init__()

        self.loss = nn.NLLLoss(reduction='none')

    def forward(self, y_model, y_true, reduction='sum'):
        # y_model: B(atch) x T(okens) x V(alues)
        # y_true: B x T
        B, T, V = y_model.size()

        y_model = y_model.view(B * T, V)
        y_true = y_true.view(B * T,)

        loss_matrix = self.loss(y_model, y_true) # B*T

        if reduction == 'sum':
            return torch.sum(loss_matrix)
        elif reduction == 'mean':
            loss_matrix = loss_matrix.view(B, T)
            return torch.mean(torch.sum(loss_matrix, 1))
        else:
            raise ValueError('Reduction could be either `sum` or `mean`.')

In [339]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, num_emb, num_heads=8):
        super().__init__()

        # hyperparams
        self.D = num_emb
        self.H = num_heads

        # weights for self-attention
        self.w_k = nn.Linear(self.D, self.D * self.H)
        self.w_q = nn.Linear(self.D, self.D * self.H)
        self.w_v = nn.Linear(self.D, self.D * self.H)

        # weights for a combination of multiple heads
        self.w_c = nn.Linear(self.D * self.H, self.D)

    def forward(self, x, causal=True):
        # x: B(atch) x T(okens) x D(imensionality)
        B, T, D = x.size()
        print('B: ', B)
        print('T: ', T)
        print('D: ', D)

        # keys, queries, values
        k = self.w_k(x).view(B, T, self.H, D) # B x T x H x D
        q = self.w_q(x).view(B, T, self.H, D) # B x T x H x D
        v = self.w_v(x).view(B, T, self.H, D) # B x T x H x D

        k = k.transpose(1, 2).contiguous().view(B * self.H, T, D) # B*H x T x D
        q = q.transpose(1, 2).contiguous().view(B * self.H, T, D) # B*H x T x D
        v = v.transpose(1, 2).contiguous().view(B * self.H, T, D) # B*H x T x D

        k = k / (D**0.25) # scaling
        q = q / (D**0.25) # scaling

        # kq
        kq = torch.bmm(q, k.transpose(1, 2)) # B*H x T x T

        # if causal
        if causal:
            mask = torch.triu_indices(T, T, offset=1)
            kq[..., mask[0], mask[1]] = float('-inf')

        # softmax
        skq = F.softmax(kq, dim=2)

        # self-attention
        sa = torch.bmm(skq, v) # B*H x T x D
        sa = sa.view(B, self.H, T, D) # B x H x T x D
        sa = sa.transpose(1, 2) # B x T x H x D
        sa = sa.contiguous().view(B, T, D * self.H) # B x T x D*H

        out = self.w_c(sa) # B x T x D

        return out


In [340]:
class TransformerBlock(nn.Module):
    def __init__(self, num_emb, num_neurons, num_heads=4):
        super().__init__()

        # hyperparams
        self.D = num_emb
        self.H = num_heads
        self.neurons = num_neurons

        # components
        self.msha = MultiHeadSelfAttention(num_emb=self.D, num_heads=self.H)
        self.layer_norm1 = nn.LayerNorm(self.D)
        self.layer_norm2 = nn.LayerNorm(self.D)

        self.mlp = nn.Sequential(nn.Linear(self.D, self.neurons * self.D),
                                nn.ReLU(),
                                nn.Linear(self.neurons * self.D, self.D))

    def forward(self, x, causal=True):
        # Multi-Head Self-Attention
        x_attn = self.msha(x, causal)
        # LayerNorm
        x = self.layer_norm1(x_attn + x)
        # MLP
        x_mlp = self.mlp(x)
        # LayerNorm
        x = self.layer_norm2(x_mlp + x)
        return x

In [341]:

class DecoderTransformer(nn.Module):
    def __init__(self, num_tokens, num_token_vals, num_emb, num_neurons, num_heads=2, dropout_prob=0.1, num_blocks=10, device='cpu'):
        super().__init__()

        # hyperparams
        self.device = device
        self.num_tokens = num_tokens
        self.num_token_vals = num_token_vals
        self.num_emb = num_emb
        self.num_blocks = num_blocks

        # embedding layer
        self.embedding = torch.nn.Embedding(num_token_vals, num_emb)

        # positional embedding
        self.positional_embedding = nn.Embedding(num_tokens, num_emb)

        # transformer blocks
        self.transformer_blocks = nn.ModuleList()
        for _ in range(num_blocks):
            self.transformer_blocks.append(TransformerBlock(num_emb=num_emb, num_neurons=num_neurons, num_heads=num_heads))

        # output layer (logits + softmax)
        self.logits = nn.Sequential(nn.Linear(num_emb, num_token_vals))

        # dropout layer
        self.dropout = nn.Dropout(dropout_prob)

        # loss function
        self.loss_fun = LossFun()

    def transformer_forward(self, x, causal=True, temperature=1.0):
        # x: B(atch) x T(okens)
        # embedding of tokens
        x = self.embedding(x) # B x T x D
        print(x)
        # embedding of positions
        pos = torch.arange(0, x.shape[1], dtype=torch.long).unsqueeze(0).to(self.device)
        print('pos: ', pos)
        pos_emb = self.positional_embedding(pos)
        # dropout of embedding of inputs
        x = self.dropout(x + pos_emb)

        # transformer blocks
        for i in range(self.num_blocks):
            x = self.transformer_blocks[i](x)

        # output logits
        out = self.logits(x)

        return F.log_softmax(out/temperature, 2)

    @torch.no_grad()
    def sample(self, batch_size=4, temperature=1.0):
        x_seq = np.asarray([[self.num_token_vals - 1] for i in range(batch_size)])

        # sample next tokens
        for i in range(self.num_tokens-1):
            xx = torch.tensor(x_seq, dtype=torch.long, device=self.device)
            # process x and calculate log_softmax
            x_log_probs = self.transformer_forward(xx, temperature=temperature)
            # sample i-th tokens
            x_i_sample = torch.multinomial(torch.exp(x_log_probs[:,i]), 1).to(self.device)
            # update the batch with new samples
            x_seq = np.concatenate((x_seq, x_i_sample.to('cpu').detach().numpy()), 1)

        return x_seq

    @torch.no_grad()
    def top1_rec(self, x, causal=True):
        x_prob = torch.exp(self.transformer_forward(x, causal=True))[:,:-1,:].contiguous()
        _, x_rec_max = torch.max(x_prob, dim=2)
        return torch.sum(torch.mean((x_rec_max.float() == x[:,1:].float().to(device)).float(), 1).float())

    def forward(self, x, causal=True, temperature=1.0, reduction='mean'):
        # get log-probabilities
        log_prob = self.transformer_forward(x, causal=causal, temperature=temperature)

        return self.loss_fun(log_prob[:,:-1].contiguous(), x[:,1:].contiguous(), reduction=reduction)

In [342]:
import matplotlib.pyplot as plt
def evaluation(test_loader, name=None, model_best=None, epoch=None, device='cuda'):
    # EVALUATION
    if model_best is None:
        # load best performing model
        model_best = torch.load(name + '.model').to(device)

    model_best.eval()
    loss = 0.
    rec = 1.
    N = 0.
    for indx_batch, test_batch in enumerate(test_loader):
        loss_t = model_best.forward(test_batch.to(device), reduction='sum')
        loss = loss + loss_t.item()

        rec_t = model_best.top1_rec(test_batch.to(device))
        rec = rec + rec_t.item()

        N = N + test_batch.shape[0]
    loss = loss / N
    rec = rec / N

    if epoch is None:
        print(f'FINAL LOSS: nll={loss}, rec={rec}')
    else:
        print(f'Epoch: {epoch}, val nll={loss}, val rec={rec}')

    return loss, rec

def plot_curve(name, nll_val, ylabel='nll'):
    plt.plot(np.arange(len(nll_val)), nll_val, linewidth='3')
    plt.xlabel('epochs')
    plt.ylabel(ylabel)
    plt.savefig(name + '_' + ylabel + '_val_curve.pdf', bbox_inches='tight')
    plt.show()
    plt.close()

In [343]:
import os
cwd = os.getcwd()
def save_texts(sampled_texts, name=''):
    # open file in write mode
    with open(cwd + '/samples_' + name + '.txt', 'w') as fp:
        for item in sampled_texts:
            # write each item in a new line
            fp.write("%s\n" % item)

In [344]:
from data import tokenizer

def training(name, max_patience, num_epochs, model, optimizer, training_loader, val_loader, device='cuda'):
    nll_val = []
    rec_val = []
    best_nll = 1000.
    patience = 0

    # Main loop
    for e in range(num_epochs):
        # TRAINING
        model.train()
        for indx_batch, batch in enumerate(training_loader):
            # Unpack the batch and move to the device
            # batch = batch[0].to(device, dtype=torch.long)
            print(f"Batch index: {indx_batch}, Batch data shape: {batch.shape}")
            loss = model.forward(batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        loss_val, r_val = evaluation(val_loader, model_best=model, epoch=e, device=device)
        nll_val.append(loss_val)  # save for plotting
        rec_val.append(r_val)

        if e == 0 or loss_val < best_nll:
            print('saved!')
            torch.save(model, name + '.model')
            best_nll = loss_val
            patience = 0

            sampled_tokens = model.sample(batch_size=64, temperature=1.0)
            sampled_texts = tokenizer.decode(sampled_tokens)
            save_texts(sampled_texts, name='epoch_' + str(e))
        else:
            patience += 1

        if patience > max_patience:
            break

    nll_val = np.asarray(nll_val)
    rec_val = np.asarray(rec_val)

    np.save(name + '_nll_val.npy', nll_val)
    np.save(name + '_rec_val.npy', rec_val)

    return nll_val, rec_val

In [345]:
tr = pd.read_csv('train_data.csv')
tr[:]['Tags']

0       love,nature,fall,arts & sciences,music,poetry ...
1       love,break-ups & vexed love,infatuation & crus...
2       living,separation & divorce,love,break-ups & v...
3                    social commentaries,race & ethnicity
4                   social commentaries,money & economics
                              ...                        
9024    living,life choices,parenthood,time & brevity,...
9025    living,coming of age,social commentaries,popul...
9026    the body,relationships,home life,nature,social...
9027    living,death,time & brevity,religion,god & the...
9028    living,disappointment & failure,love,infatuati...
Name: Tags, Length: 9029, dtype: object

In [346]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class Poems(Dataset):

    def __init__(self, dataprocessor, tokenizer, dataset, dataset_type, num_training_data=None, transforms=None):

        # PREPARE DATA
        if dataset_type == 'train':
            train_texts = dataprocessor.process_batch(dataset['Poem'] + dataset['Tags']) # list
            if num_training_data is None:
                self.data = torch.tensor(tokenizer.encode(train_texts)).long()
            else:
                self.data = torch.tensor(tokenizer.encode(train_texts)[:num_training_data]).long()
        elif dataset_type == 'val':
            validation_texts = dataprocessor.process_batch(dataset['Poem'] + dataset['Tags']) # list
            self.data = torch.tensor(tokenizer.encode(validation_texts)).long()
        else:  # 'test'
            test_texts = dataprocessor.process_batch(dataset['Poem'] + dataset['Tags']) # list
            self.data = torch.tensor(tokenizer.encode(test_texts)).long()

        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.transforms:
            sample = self.transforms(sample)
        return sample

In [347]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from data import dataprocessor

num_training_data = None  # None to take all training data

# Load your data
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
test_data = pd.read_csv('test_data.csv')

# Assuming dataprocessor and tokenizer are already defined and initialized
train_dataset = Poems(dataprocessor, tokenizer, dataset=train_data, dataset_type='train', num_training_data=num_training_data)
validation_dataset = Poems(dataprocessor, tokenizer, dataset=val_data, dataset_type='val')
test_dataset = Poems(dataprocessor, tokenizer, dataset=test_data, dataset_type='test')

# Set batch size
BATCH_SIZE = 32

# Create DataLoaders
training_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [348]:
for batch in training_loader:
    print(f"Batch shape: {batch.shape}")
    break
print(f"Dataset length: {training_loader.batch_size}")
print(f"Sample indices: {list(range(len(training_loader.dataset))[:10])}")

Batch shape: torch.Size([32, 1201])
Dataset length: 32
Sample indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [349]:
name = 'decoder_results'  # NOTE: if you run multiple experiments, you would overwrite results. Please modify this part if necessary.
results_dir = cwd + name + '/'
if not(os.path.exists(results_dir)):
  os.mkdir(results_dir)

  
num_tokens = 1201 # do not modify!
num_token_vals = 32  # do not modify!
num_neurons = 32 # please modify it
num_heads = 8 # please modify it
num_blocks = 4 # please modify it
num_emb = num_heads * 8  # please modify it but it must be a multiplication of num_heads
causal=True # do not modify!

lr = 1e-2 # learning rate; do not modify!
num_epochs = 400 # max. number of epochs; do not modify!
max_patience = 10 # an early stopping is used, if training doesn't improve for longer than 20 epochs, it is stopped; do not modify!

In [350]:
from pytorch_model_summary import summary

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DecoderTransformer(num_tokens=num_tokens, num_token_vals=num_token_vals, num_emb=num_emb, num_neurons=num_neurons, num_heads=num_heads, num_blocks=num_blocks, device=device)
model = model.to(device)

print(summary(model, torch.zeros(1, num_tokens, dtype=torch.long).to(device), show_input=False, show_hierarchical=False))

tensor([[[-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         ...,
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614]]],
       grad_fn=<EmbeddingBackward0>)
pos:  tensor([[   0,    1,    2,  ..., 1198, 1199, 1200]])
B:  1
T:  1201
D:  64
B:  1
T:  1201
D:  64
B:  1
T:  1201
D:  64
B:  1
T:  1201
D:  64
--------------------------------------------------------------------------
         Layer (type)        Output Shape         Param #     Tr. Param #
          Embedding-1       [1, 1201, 64]           2,048           2,048
          Embedding-2       [1, 1201, 64]          76,864          76,864
            Dropout-3       [1, 1201, 64]               0               0
   

In [351]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Training procedure
nll_val, rec_val = training(name=cwd + name, max_patience=max_patience, num_epochs=num_epochs, model=model, optimizer=optimizer, training_loader=training_loader, val_loader=val_loader, device=device)

Batch index: 0, Batch data shape: torch.Size([32, 1201])
tensor([[[-2.0313, -0.4334,  1.4051,  ..., -1.4786, -0.3206,  2.0472],
         [ 0.5097,  0.7616,  1.4846,  ..., -1.0104,  0.4153, -0.8346],
         [ 0.4373, -0.2146, -0.4008,  ...,  1.8021,  0.7182, -1.2288],
         ...,
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614]],

        [[-2.0313, -0.4334,  1.4051,  ..., -1.4786, -0.3206,  2.0472],
         [ 0.5097,  0.7616,  1.4846,  ..., -1.0104,  0.4153, -0.8346],
         [ 0.4373, -0.2146, -0.4008,  ...,  1.8021,  0.7182, -1.2288],
         ...,
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614],
         [-0.2336, -0.0331, -0.4799,  ...,  0.2089,  0.4676, -1.5614]],

        [[-2.0313, -0.4334,  1.4051,  ..., -1.4786, -0.3206

In [None]:

# Final evaluation
test_loss, test_rec = evaluation(name=cwd + name, test_loader=test_loader, device=device)

with open(cwd + name + '_test_loss.txt', "w") as f:
    f.write('Test NLL: ' + str(test_loss)+'\n'+'Test REC: ' + str(test_rec))
    f.close()

plot_curve(cwd + name, nll_val, ylabel='nll')
plot_curve(cwd + name, rec_val, ylabel='rec')

In [None]:
# DO NOT REMOVE
# Sample texts: load best model
model_best = torch.load(cwd + name + '.model')
model_best = model_best.eval()

# sample
temperature = 1.0 # you can modify it
num_samples = 31 # you can modify it

sampled_tokens = model_best.sample(batch_size=num_samples, temperature=temperature)  # do not modify
sampled_texts = tokenizer.decode(sampled_tokens)  # do not modify
print(sampled_texts)

save_texts(sampled_texts, name='FINAL_' + str(temperature))