In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
from torch.nn import TransformerDecoderLayer



class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed once and
    registered as a buffer, so it follows the module across devices and is
    stored in the state dict but is not a trainable parameter.

    Args:
        d_model: size of the feature (last) dimension of the inputs.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions that are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed column, so only the first d_model // 2 frequencies get
        # a cosine. For an even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Shape (max_len, 1, d_model): broadcasts over dim 1 of the input.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Dim 0 of ``x`` is treated as the position axis and the encoding is
        broadcast over dim 1; the last dimension must be ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Encoder-decoder Transformer producing log-probabilities over the vocabulary.

    Source and target token indices are embedded with separate embedding
    tables, scaled by ``sqrt(ninp)``, passed through the shared positional
    encoder, and run through a ``TransformerEncoder`` / ``TransformerDecoder``
    stack. A final linear layer maps decoder outputs to ``ntoken`` scores.

    Args:
        ntoken: vocabulary size (rows of both embeddings, width of the output).
        ninp: embedding / model dimension.
        nhead: number of attention heads per layer.
        nhid: feed-forward dimension inside each layer.
        nlayers: number of encoder layers and of decoder layers.
        dropout: dropout probability used by the positional encoder and layers.

    NOTE(review): ``forward`` calls the decoder without a ``tgt_mask``, so each
    target position can attend to every other target position, including
    later ones. Confirm this matches how the model is trained and sampled.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        try:
            from torch.nn import (
                TransformerDecoder,
                TransformerDecoderLayer,
                TransformerEncoder,
                TransformerEncoderLayer,
            )
        except ImportError as err:
            # Only a missing module should be reported as a version problem;
            # a bare ``except`` would also swallow unrelated failures.
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') from err
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.input_embed = nn.Embedding(ntoken, ninp)
        self.output_embed = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid, dropout)
        self.decoder = TransformerDecoder(decoder_layers, nlayers)
        self.linear = nn.Linear(ninp, ntoken)
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: 0.0 on/below the diagonal, -inf above."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialise embeddings and output weights uniformly; zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.input_embed.weight, -initrange, initrange)
        nn.init.uniform_(self.output_embed.weight, -initrange, initrange)
        # The bias (not the weight, which is re-initialised on the next line)
        # is the tensor that should start at zero.
        nn.init.zeros_(self.linear.bias)
        nn.init.uniform_(self.linear.weight, -initrange, initrange)

    def forward(self, src, tgt, has_mask=True):
        """Compute log-probabilities for every target position.

        Args:
            src: source token indices; dim 0 is the sequence axis.
            tgt: target token indices; dim 0 is the sequence axis.
            has_mask: when true, a square subsequent mask sized ``len(src)`` is
                applied to the encoder's self-attention; when false the
                encoder is unmasked and any cached mask is cleared.

        Returns:
            ``log_softmax`` over the last dimension of the projected decoder
            output (last dimension has size ``ntoken``).
        """
        if has_mask:
            device = src.device
            seq_len = len(src)
            # Rebuild the cached mask when the length or the device changed.
            if (self.src_mask is None
                    or self.src_mask.size(0) != seq_len
                    or self.src_mask.device != device):
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(device)
        else:
            self.src_mask = None

        scale = math.sqrt(self.ninp)
        src = self.pos_encoder(self.input_embed(src) * scale)
        tgt = self.pos_encoder(self.output_embed(tgt) * scale)
        memory = self.transformer_encoder(src, self.src_mask)
        output = self.linear(self.decoder(tgt, memory))
        return F.log_softmax(output, dim=-1)


import io
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import json
import spacy


# Load the training set. The context manager closes the file handle promptly,
# and the explicit encoding avoids depending on the platform default.
with open("twitter_training.json", encoding="utf-8") as training_file:
    trainingJson = json.load(training_file)
dataSetSize = len(trainingJson["data"])

print(dataSetSize)

# Tokenizer shared by the vocabulary counting below and by data_process().
tokenizer = get_tokenizer('basic_english')

# Token frequencies from which the vocabulary is built later.
counter = Counter()

# Despite its name, max_vocab bounds how many users are scanned (vlen counts
# users, not distinct tokens). The limit is checked before the increment, so
# up to max_vocab + 1 users contribute tokens.
max_vocab = 400
vlen = 0

device = torch.device("cpu")

# Count the marker tokens once up front so they are present in the counter
# regardless of what the scanned users contain.
for marker in ("<pad>", "<hist>", "<ehist>", "<ebio>"):
    counter.update(tokenizer(marker))

print(counter)

for user in trainingJson["data"]:
    if vlen > max_vocab:
        break
    vlen += 1
    counter.update(tokenizer(" <bio> " + user["bio"] + " <ebio> "))
    for act in user["activity"]:
        counter.update(tokenizer(act))

def data_process(text):
    """Convert raw text into a column tensor of vocabulary indices.

    The text is split with the module-level ``tokenizer`` and every token is
    looked up in the module-level ``vocab``. Tokens whose index is 0 are
    dropped.

    Returns:
        A ``torch.long`` tensor of shape ``(n, 1)``, where ``n`` is the number
        of tokens that were kept.
    """
    looked_up = (vocab[piece] for piece in tokenizer(text))
    kept = [index for index in looked_up if index != 0]
    # Build a single row and reshape it into one index per row.
    return torch.tensor([kept], dtype=torch.long).view(-1, 1)

def get_batch(x,l):
    """Build the (source, target) tensor pair for user ``x``.

    The source text is ``<hist>``, then at most ``l`` of the user's activity
    entries (each preceded by a space), then ``<ehist>``. The target text is
    the user's bio wrapped in ``<bio>`` / ``<ebio>``. Both strings are encoded
    with ``data_process`` and moved to the module-level ``device``.
    """
    user = trainingJson["data"][x]
    user_bio = user["bio"]
    activity = user["activity"]

    take = min(len(activity), l)
    history = "".join(" " + activity[i] for i in range(take))

    src = "<hist> " + history + " <ehist>"
    target = "<bio> " + user_bio + " <ebio>"

    return data_process(src).to(device), data_process(target).to(device)


# Build the vocabulary from the token frequencies collected in `counter`.
# data_process() looks tokens up in this object and drops any token whose
# index is 0 — presumably the unknown-token slot of this Vocab class;
# TODO confirm against the installed torchtext version.
vocab = Vocab(counter)

import math

import torch
import torch.nn as nn

lr = 2  # learning rate; train() applies parameter updates of size lr * 0.5

ntokens = len(vocab.stoi)  # the size of vocabulary
emsize = 400  # embedding dimension
# Feed-forward dimension of each Transformer layer. The model was previously
# built with a literal 200 while this variable said 100 and went unused; the
# variable now carries the value that is actually used.
nhid = 200
nlayers = 100  # the number of encoder layers and of decoder layers
nhead = 8  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value

model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

# The model's forward() already returns log-softmax output. CrossEntropyLoss
# applies log-softmax again, which leaves log-probabilities unchanged, so the
# value equals what nn.NLLLoss() would give on the same output.
criterion = nn.CrossEntropyLoss()
def train(iterations,historyLength):
    """Train the module-level ``model`` with one manual SGD step per user.

    Users ``0 .. iterations - 1`` are visited in order; each contributes one
    (source, target) pair from ``get_batch``. Gradients are clipped to a total
    norm of 0.5 and parameters are updated by ``-lr * 0.5 * grad``. The loss of
    every step is printed.

    Args:
        iterations: number of users to train on.
        historyLength: maximum number of activity entries per user, forwarded
            to ``get_batch``.

    Returns:
        The mean per-step loss as a float, or 0.0 when no step was run.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.

    for batch in range(iterations):
        data, targets = get_batch(batch, historyLength)
        model.zero_grad()

        output = model(data, targets, True)

        # NOTE(review): the decoder input and the labels are the same,
        # unshifted tensor — confirm this is the intended objective.
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()

        step_loss = loss.item()
        print(step_loss)
        total_loss += step_loss

        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        with torch.no_grad():
            for p in model.parameters():
                # Parameters that took no part in the forward pass have no
                # gradient; skip them instead of failing on ``None``.
                if p.grad is None:
                    continue
                p.add_(p.grad, alpha=-lr * 0.5)

    return total_loss / iterations if iterations > 0 else 0.0




In [None]:
# The first 75% of users are the training split.
trainSize = int(.75 * dataSetSize)
# Evaluation starts one past trainSize, so the user at index trainSize is in
# neither split (train() visits indices 0 .. trainSize - 1).
evalIndex = 1 + trainSize
# Uncomment to train from scratch instead of loading the checkpoint below.
# train(trainSize,15)


# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run still loads here. torch.load unpickles the file: only load
# checkpoints from a trusted source.
model.load_state_dict(torch.load("./bigModel", map_location=device))


In [None]:

# Sampling temperature: log-probabilities are divided by it before being
# exponentiated into (unnormalised) sampling weights.
temp = 1.6
# Switch dropout off once; nothing inside the loop turns training mode back on.
model.eval()

for batch in range(dataSetSize - evalIndex):
    # `history` rather than `input`, which would shadow the builtin.
    history, _ = get_batch(evalIndex + batch, 15)

    bio = "<bio>"
    # Keep the prompt on the same device as the history tensor.
    start = data_process(bio).to(device)
    with torch.no_grad():
        # Generate at most 40 tokens, stopping early at the end marker.
        for _ in range(40):
            output = model(history, start, False)
            # Scores of the last target position only.
            word_weights = output[-1].squeeze().div(temp).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Append the sampled index as a new (1, 1) row without the
            # float round trip of torch.Tensor([[...]]).long().
            start = torch.cat([start, word_idx.view(1, 1).to(device)], 0)
            word = vocab.itos[word_idx.item()]
            bio += " " + word
            if word == "<ebio>":
                break

    print(bio)
