In [46]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
from torch.nn import TransformerDecoderLayer


class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.input_embed = nn.Embedding(ntoken, ninp)
        self.output_embed = nn.Embedding(ntoken,ninp)
        self.ninp = ninp
        decoder_layers = TransformerDecoderLayer(ninp,nhead,nhid,dropout)
        self.decoder = TransformerDecoder(decoder_layers,nlayers)
        self.linear = nn.Linear(ninp,ntoken)
        self.init_weights()


    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.input_embed.weight, -initrange, initrange)
        nn.init.uniform_(self.output_embed.weight, -initrange, initrange)
        nn.init.zeros_(self.linear.weight)
        nn.init.uniform_(self.linear.weight, -initrange, initrange)

    def forward(self, src, tgt, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.input_embed(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        tgt = self.output_embed(tgt) * math.sqrt(self.ninp)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(tgt,output)
        output = self.linear(output)
        return F.log_softmax(output, dim=-1)


import io
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import json
import spacy


trainingJson = json.load(open("twitter_training.json"))
dataSetSize = len(trainingJson["data"])

print(dataSetSize)

#generate tokenizer for entire dataset

tokenizer = get_tokenizer('basic_english')

counter = Counter()

max_vocab = 50
vlen = 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

counter.update(tokenizer("<pad>"))
counter.update(tokenizer("<hist>"))
counter.update(tokenizer("<ehist>"))

print(counter)

for user in trainingJson["data"]:
    if vlen > max_vocab:
        break
    vlen += 1
    counter.update(tokenizer(" <bio> "+user["bio"]+" <ebio> "))
    for act in user["activity"]:
        counter.update(tokenizer(act))

def data_process(text):
    data =[]

    token = tokenizer(text)

    for tok in token:
        ch = vocab[tok]

        data.append(ch)


    batch = []
    batch.append(data)

    return torch.tensor(batch, dtype=torch.long).view(-1,1)


def get_batch(x,l):

    user_bio = trainingJson["data"][x]["bio"]
    src = "<hist> "

    for i in range(0,min(len(trainingJson["data"][x]["activity"]),l)):
        src += " " +trainingJson["data"][x]["activity"][i]

    src += " <ehist>"

    target = "<bio> "  + user_bio + " <ebio>"
    return data_process(src).to(device), data_process(target).to(device)


vocab = Vocab(counter)

import math

import torch
import torch.nn as nn

lr = 5.0 # learning rate

ntokens = len(vocab.stoi)  # the size of vocabulary
emsize = 400  # embedding dimension
nhid = 400  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 5  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value

model = TransformerModel(ntokens, emsize, nhead, 200, nlayers, dropout).to(device)

#criterion = nn.NLLLoss()
#criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
def train(iterations,historyLength):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.

    for batch  in range(0,iterations):
        data, targets = get_batch(batch, historyLength)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()


        output = model(data,targets,True)

        loss = criterion(output.view(1,-1,ntokens)[0], targets.view(1,-1)[0])

        loss.backward()

        print(loss.item())
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        for p in model.parameters():
             p.data.add_(p.grad, alpha=-lr)




442
Counter({'<pad>': 1, '<hist>': 1, '<ehist>': 1})


In [47]:
train(dataSetSize,5)

9.592676162719727
8.743864059448242
10.279674530029297
14.5575532913208
11.562386512756348
10.057778358459473
11.939125061035156
13.054834365844727
10.674249649047852
13.130245208740234
10.399569511413574
13.728581428527832
11.58181381225586
15.40800952911377
13.401955604553223
13.118803024291992
10.925163269042969
14.205254554748535
9.024247169494629
4.58830451965332
8.511165618896484
12.859786033630371
10.718027114868164
11.457335472106934
8.701224327087402
9.742143630981445
8.325794219970703
7.63382625579834
8.61167049407959
8.151487350463867
9.741929054260254
11.45905590057373
7.551595211029053
11.16312026977539
8.679938316345215
8.41867446899414
10.202478408813477
9.193794250488281
15.240636825561523
8.359511375427246
9.856460571289062
8.11585807800293
11.71518325805664
8.847254753112793
7.6978535652160645
9.28496265411377
9.529285430908203
7.516508102416992
7.656787872314453
8.534514427185059
9.829153060913086
8.885416984558105
5.668976783752441
11.928962707519531
7.8131995201110

In [48]:
def get_input(x,l):
    user_acts = ""
    for i in range(0,min(len(trainingJson["data"][x]["activity"]),l)):
        user_acts += " " +trainingJson["data"][x]["activity"][i]
    user_acts += " <bio> "
    return data_process(user_acts).to(device)



print(dataSetSize)

442


In [57]:
input,_ = get_batch(5,15)

bio = "<bio> "
temp = 1.5
start = data_process(bio)

for _ in range(100):
    with torch.no_grad():
         output = model(input,start,False)
         word_weights = output[-1].squeeze().div(temp).exp().cpu()
         word_idx = torch.multinomial(word_weights, 1)[0]
         word_tensor = torch.Tensor([[word_idx]]).long().to(device)
         start = torch.cat([start, word_tensor], 0)
         word = vocab.itos[word_idx]
         bio += " " + word
         if word == "<ebio>":
             break

print(bio)


<bio>  conversation observed <unk> advisory @gearsofwar co/newcukyovf unterrichten @bloodextractor through $4 women immensely co/fzshs58p0l himbos <unk> jamaica ve @mahxism ceremony @newbookspoetry @thr @denimcatfish 💚 challenge recounts define co/mz0yyovhtt co/mbfw1m7lnn nyu <unk> pharma god <unk> soviet refusal oral mentioning <unk> se… abbott <unk> gewalt @ae2501maeth rejoined co/81533723sq 25th summar… @catgirlfingies by variants cis second appreciate @madu088 co/zt5gt50wvo jobs” ruins opinions direct lately @autumngxi t- flawed sources care again vaxxine @imdb stay weekly afford @cherry_viper_ say <ebio>
