In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Poetry Foundation dump. The raw frame keeps its own name so the
# unfiltered table stays reachable without re-reading the CSV.
data_raw = pd.read_csv('PoetryFoundationData.csv')

# Number of rows whose `Tags` cell is non-empty. The original evaluated this
# expression in the middle of the cell, where its value is silently discarded;
# binding it to a name makes it usable.
num_tagged = int(data_raw['Tags'].count())

# Keep only tagged poems and only the two columns used from here on.
data = data_raw.dropna(subset=['Tags'])[['Poem', 'Tags']]

# Sanity check: dropping NaN tags must leave exactly the rows counted above.
assert len(data) == num_tagged, 'row count after dropna does not match the tag count'

In [None]:
# DO NOT REMOVE OR MODIFY
class LossFun(nn.Module):
    """Negative log-likelihood over token sequences.

    Wraps ``nn.NLLLoss(reduction='none')`` so that the per-token losses can be
    reduced either over everything (``'sum'``) or per sequence and then
    averaged over the batch (``'mean'``).
    """

    def __init__(self,):
        super().__init__()

        # Unreduced NLL: one value per (sequence, position) pair.
        self.loss = nn.NLLLoss(reduction='none')

    def forward(self, y_model, y_true, reduction='sum'):
        """Compute the reduced NLL.

        Args:
            y_model: B x T x V tensor handed to ``nn.NLLLoss`` (callers in this
                file pass log-probabilities).
            y_true: B x T tensor of target indices.
            reduction: ``'sum'`` adds up every per-token loss; ``'mean'`` sums
                over T and averages over B.

        Raises:
            ValueError: for any other ``reduction`` value.
        """
        n_seq, n_pos, n_vals = y_model.size()

        # NLLLoss takes (N, V) inputs and (N,) targets: fold batch and time together.
        per_token = self.loss(y_model.view(n_seq * n_pos, n_vals),
                              y_true.view(n_seq * n_pos,))  # B*T

        if reduction == 'sum':
            return torch.sum(per_token)
        if reduction == 'mean':
            per_sequence = torch.sum(per_token.view(n_seq, n_pos), 1)  # B
            return torch.mean(per_sequence)
        raise ValueError('Reduction could be either `sum` or `mean`.')

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention where every head works at the full embedding width.

    The key/query/value projections map D -> D * H, so each of the H heads gets
    its own D-dimensional keys, queries and values. The concatenated head
    outputs (D * H) are projected back to D.
    """

    def __init__(self, num_emb, num_heads=8):
        super().__init__()

        # hyperparams
        self.D = num_emb
        self.H = num_heads

        wide = self.D * self.H

        # per-head key / query / value projections
        self.w_k = nn.Linear(self.D, wide)
        self.w_q = nn.Linear(self.D, wide)
        self.w_v = nn.Linear(self.D, wide)

        # mixes the concatenated heads back down to D
        self.w_c = nn.Linear(wide, self.D)

    def _split_heads(self, projection, x):
        """Apply `projection` to x (B x T x D) and return a B*H x T x D tensor."""
        B, T, D = x.size()
        per_head = projection(x).view(B, T, self.H, D)  # B x T x H x D
        return per_head.transpose(1, 2).contiguous().view(B * self.H, T, D)

    def forward(self, x, causal=True):
        """Attend over x (B x T x D) and return a B x T x D tensor.

        With ``causal`` truthy, position t only attends to positions <= t.
        """
        B, T, D = x.size()

        # Keys and queries are each divided by D**0.25, so their product is
        # scaled by sqrt(D) overall.
        scale = D**0.25
        keys = self._split_heads(self.w_k, x) / scale
        queries = self._split_heads(self.w_q, x) / scale
        values = self._split_heads(self.w_v, x)

        scores = torch.bmm(queries, keys.transpose(1, 2))  # B*H x T x T

        if causal:
            # True strictly above the diagonal, i.e. for "future" positions.
            future = torch.triu(torch.ones(T, T, device=scores.device), diagonal=1).bool()
            scores = scores.masked_fill(future, float('-inf'))

        weights = F.softmax(scores, dim=2)

        mixed = torch.bmm(weights, values)  # B*H x T x D
        mixed = mixed.view(B, self.H, T, D).transpose(1, 2)  # B x T x H x D
        mixed = mixed.contiguous().view(B, T, D * self.H)  # B x T x D*H

        return self.w_c(mixed)  # B x T x D


In [None]:
class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention, then an MLP.

    Each sub-layer reads a layer-normalized copy of its input, and its
    (dropped-out) output is added back onto the un-normalized input:

        x = x + dropout(attention(layer_norm1(x)))
        x = x + dropout(mlp(layer_norm2(x)))

    The previous implementation overwrote ``x`` with ``layer_norm(x)`` before
    the addition, so the skip path carried the normalized tensor rather than
    the block input. Sub-module names and creation order are unchanged, so
    existing ``state_dict``s still load.

    Args:
        num_emb: embedding width D.
        num_neurons: the hidden layer of the MLP has ``num_neurons * num_emb`` units.
        num_heads: number of attention heads.
        dropout_prob: dropout rate applied to both sub-layer outputs
            (defaults to the previously hard-coded 0.1).
    """

    def __init__(self, num_emb, num_neurons, num_heads=4, dropout_prob=0.1):
        super().__init__()

        # hyperparams
        self.D = num_emb
        self.H = num_heads
        self.neurons = num_neurons

        # components
        self.msha = MultiHeadSelfAttention(num_emb=self.D, num_heads=self.H)
        self.layer_norm1 = nn.LayerNorm(self.D)
        self.layer_norm2 = nn.LayerNorm(self.D)
        self.dropout = nn.Dropout(dropout_prob)

        self.mlp = nn.Sequential(nn.Linear(self.D, self.neurons * self.D),
                                nn.GELU(),
                                nn.Linear(self.neurons * self.D, self.D))

    def forward(self, x, causal=True):
        """Apply the block to x (B x T x D); ``causal`` is forwarded to the attention."""
        # Attention sub-layer: normalize only the branch input, keep x on the skip path.
        x = x + self.dropout(self.msha(self.layer_norm1(x), causal))

        # MLP sub-layer, same residual layout.
        x = x + self.dropout(self.mlp(self.layer_norm2(x)))

        return x

In [None]:
# DO NOT REMOVE OR MODIFY
class DecoderTransformer(nn.Module):
    """Decoder-only transformer over token sequences.

    Args:
        num_tokens: number of positions with a learned positional embedding;
            also the length of the sequences produced by `sample`.
        num_token_vals: vocabulary size. `sample` seeds every sequence with
            the id ``num_token_vals - 1``.
        num_emb: embedding width.
        num_neurons: forwarded to every `TransformerBlock`.
        num_heads: attention heads per block.
        dropout_prob: dropout applied to the summed token + position embeddings.
        num_blocks: number of stacked `TransformerBlock`s.
        device: device on which `sample` builds its input tensors.
    """

    def __init__(self, num_tokens, num_token_vals, num_emb, num_neurons, num_heads=2, dropout_prob=0.1, num_blocks=10, device='cpu'):
        super().__init__()

        # hyperparams
        self.device = device
        self.num_tokens = num_tokens
        self.num_token_vals = num_token_vals
        self.num_emb = num_emb
        self.num_blocks = num_blocks

        # embedding layer
        self.embedding = torch.nn.Embedding(num_token_vals, num_emb)

        # positional embedding
        self.positional_embedding = nn.Embedding(num_tokens, num_emb)

        # transformer blocks
        self.transformer_blocks = nn.ModuleList()
        for _ in range(num_blocks):
            self.transformer_blocks.append(TransformerBlock(num_emb=num_emb, num_neurons=num_neurons, num_heads=num_heads))

        # output layer (logits; log-softmax is applied in transformer_forward)
        self.logits = nn.Sequential(nn.Linear(num_emb, num_token_vals))

        # dropout layer
        self.dropout = nn.Dropout(dropout_prob)

        # loss function
        self.loss_fun = LossFun()

    def transformer_forward(self, x, causal=True, temperature=1.0):
        """Return log-probabilities of shape B x T x V for token ids x (B x T).

        ``causal`` is forwarded to every transformer block (it used to be
        accepted but ignored, so the blocks always ran with their default).
        Logits are divided by ``temperature`` before the log-softmax.
        """
        # embedding of tokens
        tok_emb = self.embedding(x) # B x T x D
        # embedding of positions, created directly on the input's device so it
        # matches x even if self.device is stale
        pos = torch.arange(0, x.shape[1], dtype=torch.long, device=x.device).unsqueeze(0)
        pos_emb = self.positional_embedding(pos)
        # dropout of embedding of inputs
        h = self.dropout(tok_emb + pos_emb)

        # transformer blocks
        for block in self.transformer_blocks:
            h = block(h, causal)

        # output logits
        out = self.logits(h)

        return F.log_softmax(out/temperature, 2)

    @torch.no_grad()
    def sample(self, batch_size=4, temperature=1.0):
        """Autoregressively sample `batch_size` sequences of `num_tokens` ids.

        Returns a numpy array of shape (batch_size, num_tokens) whose first
        column is the seed id ``num_token_vals - 1``.
        """
        x_seq = np.asarray([[self.num_token_vals - 1] for i in range(batch_size)])

        # sample next tokens
        for i in range(self.num_tokens-1):
            xx = torch.tensor(x_seq, dtype=torch.long, device=self.device)
            # process x and calculate log_softmax
            x_log_probs = self.transformer_forward(xx, temperature=temperature)
            # sample the next token from the distribution predicted at position i
            x_i_sample = torch.multinomial(torch.exp(x_log_probs[:,i]), 1)
            # update the batch with new samples
            x_seq = np.concatenate((x_seq, x_i_sample.cpu().numpy()), 1)

        return x_seq

    @torch.no_grad()
    def top1_rec(self, x, causal=True):
        """Sum over the batch of per-sequence top-1 next-token accuracy.

        The prediction at position t is compared with the token at t + 1. The
        comparison happens on x's own device; the previous code moved the
        targets to a module-level global ``device`` and always ran causally.
        """
        x_prob = torch.exp(self.transformer_forward(x, causal=causal))[:,:-1,:].contiguous()
        _, x_rec_max = torch.max(x_prob, dim=2)
        hits = (x_rec_max == x[:,1:]).float()  # B x (T-1)
        return torch.sum(torch.mean(hits, 1))

    def forward(self, x, causal=True, temperature=1.0, reduction='mean'):
        """Next-token negative log-likelihood of x (B x T) under the model."""
        # get log-probabilities
        log_prob = self.transformer_forward(x, causal=causal, temperature=temperature)

        # predictions at positions 0..T-2 are scored against tokens 1..T-1
        return self.loss_fun(log_prob[:,:-1].contiguous(), x[:,1:].contiguous(), reduction=reduction)

In [None]:
# DO NOT REMOVE OR MODIFY
def evaluation(test_loader, name=None, model_best=None, epoch=None, device='cuda'):
    """Evaluate a model on `test_loader` and print/return its average loss and accuracy.

    Args:
        test_loader: iterable of batches; each batch must support ``.to(device)``
            and expose ``.shape[0]`` as the number of sequences.
        name: checkpoint path without the ``.model`` suffix; only used when
            `model_best` is None.
        model_best: model to evaluate. If None, it is loaded from ``name + '.model'``.
        epoch: when given, the message is printed as a per-epoch validation line,
            otherwise as the final result.
        device: device the batches (and a loaded model) are moved to.

    Returns:
        ``(loss, rec)``: the summed NLL and the summed `top1_rec`, each divided
        by the total number of sequences seen.
    """
    # EVALUATION
    if model_best is None:
        # load best performing model
        # NOTE(review): torch.load unpickles a whole model object -- only load
        # checkpoints you created yourself. Newer PyTorch releases default to
        # `weights_only=True`, which presumably rejects such pickles; confirm
        # against the installed version.
        model_best = torch.load(name + '.model').to(device)

    model_best.eval()
    loss = 0.
    rec = 0.  # was initialised to 1., which inflated the reported accuracy by 1/N
    N = 0.
    # No gradients are needed here; avoids building autograd graphs per batch.
    with torch.no_grad():
        for test_batch in test_loader:
            test_batch = test_batch.to(device)

            loss_t = model_best.forward(test_batch, reduction='sum')
            loss = loss + loss_t.item()

            rec_t = model_best.top1_rec(test_batch)
            rec = rec + rec_t.item()

            N = N + test_batch.shape[0]
    loss = loss / N
    rec = rec / N

    if epoch is None:
        print(f'FINAL LOSS: nll={loss}, rec={rec}')
    else:
        print(f'Epoch: {epoch}, val nll={loss}, val rec={rec}')

    return loss, rec

def plot_curve(name, nll_val, ylabel='nll'):
    """Plot one value per epoch, save the figure as a PDF, then show and close it.

    The file is written to ``name + '_' + ylabel + '_val_curve.pdf'``.
    """
    # NOTE(review): `plt` is not imported in any cell visible in this notebook;
    # presumably `matplotlib.pyplot` is expected under that name -- confirm.
    epochs = np.arange(len(nll_val))
    plt.plot(epochs, nll_val, linewidth='3')

    plt.xlabel('epochs')
    plt.ylabel(ylabel)

    pdf_path = ''.join((name, '_', ylabel, '_val_curve.pdf'))
    plt.savefig(pdf_path, bbox_inches='tight')

    plt.show()
    plt.close()

In [None]:
# DO NOT REMOVE OR MODIFY
def training(name, max_patience, num_epochs, model, optimizer, training_loader, val_loader, device='cuda'):
    """Train `model` with early stopping on the validation NLL.

    After every epoch the model is evaluated on `val_loader`. Whenever the
    validation NLL improves (and always after the first epoch) the model is
    saved to ``name + '.model'`` and 64 sampled texts are written via the
    module-level `tokenizer` and `save_texts`. Training stops once more than
    `max_patience` consecutive epochs have passed without improvement, or
    after `num_epochs` epochs.

    Returns:
        ``(nll_val, rec_val)``: numpy arrays with one validation value per
        epoch; both are also saved as ``name + '_nll_val.npy'`` and
        ``name + '_rec_val.npy'``.
    """
    nll_val = []
    rec_val = []
    best_nll = 1000.
    patience = 0

    # Main loop
    for e in range(num_epochs):
        # TRAINING
        model.train()
        for batch in training_loader:
            loss = model.forward(batch.to(device))

            optimizer.zero_grad()
            # Each step builds a fresh graph and backpropagates through it once,
            # so there is nothing to retain (`retain_graph=True` only held on
            # to the graph's buffers for longer).
            loss.backward()
            optimizer.step()

        # Validation
        loss_val, r_val = evaluation(val_loader, model_best=model, epoch=e, device=device)
        nll_val.append(loss_val)  # save for plotting
        rec_val.append(r_val)

        # The first epoch always counts as an improvement. `patience` is still
        # 0 at that point, so resetting it there changes nothing.
        if e == 0 or loss_val < best_nll:
            print('saved!')
            torch.save(model, name + '.model')
            best_nll = loss_val
            patience = 0

            sampled_tokens = model.sample(batch_size=64, temperature=1.0)
            sampled_texts = tokenizer.decode(sampled_tokens)
            save_texts(sampled_texts, name='epoch_' + str(e))
        else:
            patience = patience + 1

        if patience > max_patience:
            break

    nll_val = np.asarray(nll_val)
    rec_val = np.asarray(rec_val)

    np.save(name + '_nll_val.npy', nll_val)
    np.save(name + '_rec_val.npy', rec_val)

    return nll_val, rec_val

In [None]:
# PLEASE MODIFY ACCORDING TO THE REPORT REQUIREMENTS
num_training_data = None  # None to take all training data

# DO NOT REMOVE OR MODIFY THE REST OF THIS CELL
# NOTE(review): `Headers`, `dataprocessor`, `tokenizer` and `DataLoader` are not
# defined or imported in any cell visible in this notebook; this cell fails on a
# fresh kernel unless they come from elsewhere -- confirm.
#-dataset
# One dataset object per split; only the training split receives `num_training_data`.
train_dataset = Headers(dataprocessor, tokenizer, num_training_data=num_training_data, mode="train")
validation_dataset = Headers(dataprocessor, tokenizer, mode="val")
test_dataset = Headers(dataprocessor, tokenizer, mode="test")

#-dataloaders
BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep their order.
training_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# DO NOT REMOVE but PLEASE MODIFY WHENEVER YOU ARE ASKED FOR IT!
# NOTE: in order to obtain required sizes of your models, you can play with
#       various values of num_neurons, num_heads, num_blocks, num_emb
num_tokens = 150 # sequence length / number of positional embeddings; do not modify!
num_token_vals = 64  # vocabulary size; do not modify!
num_neurons = 32 # please modify it; the MLP hidden layer in TransformerBlock has num_neurons * num_emb units
num_heads = 8 # please modify it
num_blocks = 4 # please modify it
# NOTE(review): the MultiHeadSelfAttention in this notebook gives every head the
# full num_emb width, so it does not itself require num_emb to be divisible by num_heads.
num_emb = num_heads * 4  # please modify it but it must be a multiple of num_heads
# NOTE(review): `causal` is not passed to anything in the cells visible here.
causal=True # do not modify!

# NOTE(review): `lr` is not used in the cells visible here (no optimizer is created) -- confirm.
lr = 1e-3 # learning rate; do not modify!
num_epochs = 1000 # max. number of epochs; do not modify!
max_patience = 10 # early stopping: training stops once validation NLL has not improved for more than max_patience consecutive epochs; do not modify!

In [None]:
# DO NOT REMOVE OR MODIFY
# Build the decoder from the hyperparameters above and move it to `device`.
# NOTE(review): `device` and `summary` are not defined or imported in any cell
# visible in this notebook -- confirm where they come from.
model = DecoderTransformer(num_tokens=num_tokens, num_token_vals=num_token_vals, num_emb=num_emb, num_neurons=num_neurons, num_heads=num_heads, num_blocks=num_blocks, device=device)
model = model.to(device)
# Print the summary (like in Keras)
# The dummy input is a single all-zero sequence of num_tokens token ids.
print(summary(model, torch.zeros(1, num_tokens, dtype=torch.long).to(device), show_input=False, show_hierarchical=False))

In [None]:
# DO NOT REMOVE OR MODIFY
# Training procedure
# Returns the per-epoch validation NLL and accuracy arrays used for plotting later.
# NOTE(review): `optimizer`, `results_dir` and `name` are never created in the
# cells visible in this notebook (only `lr` is defined), so this cell raises a
# NameError on a fresh kernel unless they are defined elsewhere -- confirm.
nll_val, rec_val = training(name=results_dir + name, max_patience=max_patience, num_epochs=num_epochs, model=model, optimizer=optimizer, training_loader=training_loader, val_loader=val_loader, device=device)

In [None]:
# DO NOT REMOVE OR MODIFY
# Final evaluation
# Passing only `name` (no `model_best`) makes `evaluation` load the checkpoint
# stored at `results_dir + name + '.model'`.
test_loss, test_rec = evaluation(name=results_dir + name, test_loader=test_loader, device=device)

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(results_dir + name + '_test_loss.txt', "w") as f:
    f.write('Test NLL: ' + str(test_loss)+'\n'+'Test REC: ' + str(test_rec))

# Validation curves collected during training.
plot_curve(results_dir + name, nll_val, ylabel='nll')
plot_curve(results_dir + name, rec_val, ylabel='rec')

In [None]:
# DO NOT REMOVE
# Sample texts: load best model
# NOTE(review): unlike `evaluation`, this load is not followed by `.to(device)`;
# the model stays wherever the checkpoint places it -- confirm that matches the
# `device` attribute stored on the model, which `sample` uses for its inputs.
model_best = torch.load(results_dir + name + '.model')
model_best = model_best.eval()

# sample
temperature = 1.0 # you can modify it
num_samples = 64 # you can modify it

sampled_tokens = model_best.sample(batch_size=num_samples, temperature=temperature)  # do not modify
sampled_texts = tokenizer.decode(sampled_tokens)  # do not modify
print(sampled_texts)

# The temperature is part of the output name passed to `save_texts`.
save_texts(sampled_texts, name='FINAL_' + str(temperature))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# read data
# NOTE(review): this repeats the CSV load from the data cell near the top of the
# notebook and rebinds `data` to the full, unfiltered table.
data = pd.read_csv('PoetryFoundationData.csv')
# Preview the first rows; as the cell's last expression it is what gets displayed.
data.head(15)

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\n\r\n Objects Used to Pr...,"\r\n\r\nDog bone, stapler,\r\n\r\ncribbage boa...",Michelle Menting,
1,1,\r\n\r\n The New Church\r\n...,\r\n\r\nThe old cupola glinted above the cloud...,Lucia Cherciu,
2,2,\r\n\r\n Look for Me\r\n\r\...,\r\n\r\nLook for me under the hood\r\n\r\nof t...,Ted Kooser,
3,3,\r\n\r\n Wild Life\r\n\r\n ...,"\r\n\r\nBehind the silo, the Mother Rabbit\r\n...",Grace Cavalieri,
4,4,\r\n\r\n Umbrella\r\n\r\n ...,\r\n\r\nWhen I push your button\r\n\r\nyou fly...,Connie Wanek,
5,5,\r\n\r\n Sunday\r\n\r\n ...,\r\n\r\nYou are the start of the week\r\n\r\no...,January Gill O'Neil,
6,6,\r\n\r\n Invisible Fish\r\n...,\r\n\r\nInvisible fish swim this ghost ocean n...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\r\n\r\n Don’t Bother the E...,\r\n\r\nDon’t bother the earth spirit who live...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
8,8,\r\n\r\n The One Thing That...,\r\n\r\nIs anything central?\r\n\r\nOrchards f...,John Ashbery,
9,9,"\r\n\r\n [""Hour in which I ...","\r\n\r\nHour in which I consider hydrangea, a ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."


In [None]:
# count number of non empty cells in column tags
# (`count()` skips missing values; the recorded output below is 12899)
data['Tags'].count()

12899

In [None]:
# drop rows with empty cells in column tags
# NOTE(review): rebinds `data`, so the unfiltered frame is no longer reachable
# after this cell; the same step already appears in an earlier cell.
data = data.dropna(subset=['Tags'])
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
6,6,\r\n\r\n Invisible Fish\r\n...,\r\n\r\nInvisible fish swim this ghost ocean n...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\r\n\r\n Don’t Bother the E...,\r\n\r\nDon’t bother the earth spirit who live...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
9,9,"\r\n\r\n [""Hour in which I ...","\r\n\r\nHour in which I consider hydrangea, a ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."
16,16,\r\n\r\n scars\r\n\r\n ...,\r\n\r\nmy father’s body is a map\r\n\r\na rec...,Truong Tran,"The Body,Family & Ancestors"
17,17,\r\n\r\n what remains two\r...,\r\n\r\nit has long been forgotten this practi...,Truong Tran,"Infancy,Parenthood,The Body"


In [None]:
# make new df with poem and tags
# Keeps only the `Poem` and `Tags` columns (same step as in an earlier cell).
data = data[['Poem', 'Tags']]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# read data
# NOTE(review): second repetition of the CSV load found in earlier cells of this
# notebook; it rebinds `data` to the full, unfiltered table once more.
data = pd.read_csv('PoetryFoundationData.csv')
# Preview the first rows; as the cell's last expression it is what gets displayed.
data.head(15)

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\n\r\n Objects Used to Pr...,"\r\n\r\nDog bone, stapler,\r\n\r\ncribbage boa...",Michelle Menting,
1,1,\r\n\r\n The New Church\r\n...,\r\n\r\nThe old cupola glinted above the cloud...,Lucia Cherciu,
2,2,\r\n\r\n Look for Me\r\n\r\...,\r\n\r\nLook for me under the hood\r\n\r\nof t...,Ted Kooser,
3,3,\r\n\r\n Wild Life\r\n\r\n ...,"\r\n\r\nBehind the silo, the Mother Rabbit\r\n...",Grace Cavalieri,
4,4,\r\n\r\n Umbrella\r\n\r\n ...,\r\n\r\nWhen I push your button\r\n\r\nyou fly...,Connie Wanek,
5,5,\r\n\r\n Sunday\r\n\r\n ...,\r\n\r\nYou are the start of the week\r\n\r\no...,January Gill O'Neil,
6,6,\r\n\r\n Invisible Fish\r\n...,\r\n\r\nInvisible fish swim this ghost ocean n...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\r\n\r\n Don’t Bother the E...,\r\n\r\nDon’t bother the earth spirit who live...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
8,8,\r\n\r\n The One Thing That...,\r\n\r\nIs anything central?\r\n\r\nOrchards f...,John Ashbery,
9,9,"\r\n\r\n [""Hour in which I ...","\r\n\r\nHour in which I consider hydrangea, a ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."


In [None]:
# count number of non empty cells in column tags
# (`count()` skips missing values; the recorded output below is 12899)
data['Tags'].count()

12899

In [None]:
# drop rows with empty cells in column tags
# NOTE(review): rebinds `data`, so the unfiltered frame is no longer reachable
# after this cell; the same step already appears in earlier cells.
data = data.dropna(subset=['Tags'])
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
6,6,\r\n\r\n Invisible Fish\r\n...,\r\n\r\nInvisible fish swim this ghost ocean n...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\r\n\r\n Don’t Bother the E...,\r\n\r\nDon’t bother the earth spirit who live...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
9,9,"\r\n\r\n [""Hour in which I ...","\r\n\r\nHour in which I consider hydrangea, a ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."
16,16,\r\n\r\n scars\r\n\r\n ...,\r\n\r\nmy father’s body is a map\r\n\r\na rec...,Truong Tran,"The Body,Family & Ancestors"
17,17,\r\n\r\n what remains two\r...,\r\n\r\nit has long been forgotten this practi...,Truong Tran,"Infancy,Parenthood,The Body"


In [None]:
# make new df with poem and tags
# Keeps only the `Poem` and `Tags` columns (same step as in earlier cells).
data = data[['Poem', 'Tags']]