In [3]:
import torch
import torch.nn.functional as F
from collections import defaultdict
import random
import numpy as np

# Language Modeling

From the CMU course http://phontron.com/class/nn4nlp2017

I guess the following several examples are from lecture 2

In [53]:
# The length of the n-gram
N = 2
training_size = 10

# Functions to read in the corpus
# NOTE: We are using data from the Penn Treebank, which is already converted
#       into an easy-to-use format with "<unk>" symbols. If we were using other
#       data we would have to do pre-processing and consider how to choose
#       unknown words, etc.
device = torch.device("cpu")
w2i = defaultdict(lambda: torch.tensor(len(w2i), device=device))
S = w2i["<s>"]
UNK = w2i["<unk>"]
def read_dataset(filename):
    with open(filename, "r") as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            if i > training_size:
                break
            print i, line.strip()   
            yield [w2i[x] for x in line.strip().split(" ")]

# Read in the data
train = list(read_dataset("data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
#dev = list(read_dataset("data/ptb/valid.txt"))
i2w = {v.item(): k for k, v in w2i.items()}
nwords = len(w2i)

1 pierre <unk> N years old will join the board as a nonexecutive director nov. N
2 mr. <unk> is chairman of <unk> n.v. the dutch publishing group
3 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate
4 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported
5 the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said
6 <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in N
7 although preliminary findings were reported more than a year ago the latest results appear in today 's new england journal of medicine a forum likely to bring new attention to the problem
8 a <unk> <unk> said 

In [297]:
train[0]

[tensor(2),
 tensor(1),
 tensor(3),
 tensor(4),
 tensor(5),
 tensor(6),
 tensor(7),
 tensor(8),
 tensor(9),
 tensor(10),
 tensor(11),
 tensor(12),
 tensor(13),
 tensor(14),
 tensor(3)]

In [298]:
" ".join([i2w[i.item()] for i in train[0]])

'pierre <unk> N years old will join the board as a nonexecutive director nov. N'

In [92]:
class LogLin(torch.nn.Module):
    #Simple additive model
    def __init__(self, vocab_len):
        super(LogLin, self).__init__()
        
        self.embed1 = torch.nn.Embedding(vocab_len, vocab_len)
        self.embed2 = torch.nn.Embedding(vocab_len, vocab_len)
        self.bias = torch.nn.Parameter(torch.zeros(1, vocab_len))
        
    def forward(self, inputs):
        word1, word2 = inputs
        return F.log_softmax(self.embed1(word1)+self.embed2(word2)+self.bias, dim = 1)
    
model = LogLin(3)
model([torch.tensor(1),torch.tensor(0)])

tensor([[-1.5668, -0.9921, -0.8663]], grad_fn=<LogSoftmaxBackward>)

In [354]:
model = LogLin(nwords)

criterion = torch.nn.CrossEntropyLoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum = 0.5)

short_train = train #[:10]
for t in range(400):
    model.zero_grad()
    random.shuffle(short_train)
    tot_loss = 0.0
    for i, sentence in enumerate(short_train):
        hist = [S, S]
        losses = torch.tensor(0., device = device)
        for next_word in sentence+[S]:
            pred = model(hist)
            losses += criterion(pred, torch.tensor([next_word], device = device))
            hist = hist[1:] + [next_word]
        losses.backward()
        tot_loss += losses.item()
        optimizer.step()
    print t, tot_loss
                           

0 1175.51071167
1 1080.92725372
2 1024.51812363
3 968.134040833
4 921.705257416
5 866.729156494
6 821.252960205
7 773.63004303
8 733.343540192
9 692.92036438
10 653.750497818
11 623.500928879
12 587.504966736
13 553.747602463
14 522.264720917
15 492.827001572
16 465.51442337
17 440.890245438
18 413.369297028
19 385.621337891
20 363.250916481
21 340.145839691
22 317.310652733
23 299.64923954
24 281.001944542
25 267.551102638
26 252.59250164
27 237.609023094
28 223.816936493
29 210.883934021
30 199.161718369
31 190.241986275
32 177.557794571
33 169.100515842
34 160.201855183
35 153.005021095
36 144.864591599
37 139.812113285
38 133.562878132
39 128.6095047
40 124.265469551
41 120.325480461
42 116.817925453
43 112.718719006
44 108.837460518
45 105.59706068
46 103.520357132
47 100.644711494
48 98.091196537
49 95.7719306946
50 93.5239610672
51 92.0107827187
52 90.2010707855
53 88.9346356392
54 87.0445551872
55 85.9523730278
56 84.9849567413
57 83.5229434967
58 82.3433227539
59 80.9528694153

In [356]:
MAX_LEN = 100
with torch.no_grad():
    hist = [S] * N
    sent = []
    while True:
        p = model(hist)
        p = torch.exp(p[0]).numpy()
        next_word = np.random.choice(nwords, p=p/p.sum())
        # print next_word
        if next_word == S or len(sent) == MAX_LEN:
            break
        sent.append(next_word)
        hist = hist[1:] + [torch.tensor(next_word)]
        
    print [i2w[w] for w in sent]

['we', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties']


### Ukkk... This one is broken

## Fixed it

In [20]:
class NNLM(torch.nn.Module):

    def __init__(self, vocab_len, embedding_dim):
        super(NNLM, self).__init__()
        
        self.embed = torch.nn.Embedding(vocab_len, embedding_dim)
        self.linear = torch.nn.Linear(2*embedding_dim, vocab_len)
        
    def forward(self, inputs):
        word1, word2 = inputs
        embedded = torch.cat((self.embed(word1), self.embed(word2))).view(1,-1)
        # print embedded
        return F.log_softmax(self.linear(torch.tanh(embedded)), dim = 1 )
    
model = NNLM(3, 5)
model([torch.tensor(1),torch.tensor(0)])

tensor([[-0.7134, -1.6249, -1.1613]], grad_fn=<LogSoftmaxBackward>)

In [17]:
model = NNLM(2, 4)

criterion = torch.nn.CrossEntropyLoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

S = torch.tensor(0)

short_train = [[torch.tensor(1)]]
for t in range(10000):
    
    random.shuffle(short_train)
    tot_loss = 0.0
    for i, sentence in enumerate(short_train):
        hist = [S, S]
        losses = torch.tensor(0.)
        optimizer.zero_grad()
        for next_word in sentence[:1]+[S]:
            
            pred = model(hist)

            losses += criterion(pred, next_word.view(1))
            hist = hist[1:] + [next_word]
        losses.backward()
        tot_loss += losses.item()
        optimizer.step()
        
    if t%1000 == 999:
        print t, tot_loss/len(short_train)

999 0.0198105722666
1999 0.00922349840403
2999 0.0059280670248
3999 0.00434153573588
4999 0.00341386138462
5999 0.00280707702041
6999 0.00238014617935
7999 0.00206399708986
8999 0.00182056985795
9999 0.00162761716638


In [19]:
MAX_LEN = 10
with torch.no_grad():
    hist = [S] * 2
    sent = []
    while True:
        print hist
        p = model(hist)
        print p
        p = torch.exp(p).numpy()
        print p
        next_word = np.argmax(p)
        print next_word
        if next_word == S or len(sent) == MAX_LEN:
            print "breaking", len(sent), next_word , S
            break
        sent.append(next_word)
        hist = hist[1:] + [torch.tensor(next_word)]
        
    print [i2w[w] for w in sent]

[tensor(0), tensor(0)]
tensor([[-7.1069, -0.0008]])
[[8.193913e-04 9.991807e-01]]
1
[tensor(0), tensor(1)]
tensor([[-0.0008, -7.1217]])
[[9.9919254e-01 8.0740202e-04]]
0
breaking 1 0 tensor(0)
['<s>']


Something is very broken... not sure who I can ask for help from though...

Fixed it! 

# RNNs

And now lecture 6!

In [310]:
class RNN(torch.nn.Module):
    def __init__(self, vocab_len, embed_dim, hidden_dim):
        super(RNN, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embed = torch.nn.Embedding(vocab_len, embed_dim)
        self.rnn = torch.nn.RNN(embed_dim, hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)
        self.hidden = None
        self.initialize()
    def forward(self, sentence):
        embedding = self.embed(sentence).view(len(sentence),1,-1)
        out, self.hidden = self.rnn(embedding, self.hidden)
        return F.log_softmax(self.linear(out), dim = 2).view(1,-1)
    
    def initialize(self):
        self.hidden = torch.zeros(1,1,self.hidden_dim)
    
rnn = RNN(3,16,2)
rnn(torch.tensor([0]))       

tensor([[-1.7678, -0.4086, -1.8036]], grad_fn=<ViewBackward>)

In [346]:
model = RNN(len(w2i),16,16)


criterion = torch.nn.CrossEntropyLoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum = 0.1)


train_short = train #[:500]
for t in range(1000):
    
    random.shuffle(train_short)
    train_loss = 0.0
    for sentence in train_short:

        optimizer.zero_grad()
        model.initialize()

        x = torch.tensor([w for w in sentence])
        last_word = S
        losses = torch.tensor(0.)
        for word in sentence + [S]:
            y_pred = model(last_word.view(1))
            losses += criterion(y_pred, torch.tensor([word]))
            last_word = word
        train_loss += losses.item()
        losses.backward()
        optimizer.step()
    if t%100 == 99:
        print t, train_loss/len(train)

99 10.4899823189
199 5.1687895298
299 3.99518003464
399 3.52835900784
499 3.28303370476
599 3.13431785107
699 2.98396658897
799 2.86056520939
899 2.7915494442
999 2.74182777405


In [348]:
MAX_LEN = 100
with torch.no_grad():
    model.initialize()
    hist = S
    sent = []
    while True:
        p = model(hist.view(1))
        p = torch.exp(p).numpy()[0]

        next_word = np.random.choice([torch.tensor(i) for i in range(len(w2i))], p = p/p.sum())
        if next_word == S or len(sent) == MAX_LEN:
            print "breaking", len(sent), next_word , S
            break
        sent.append(next_word)
        hist = torch.tensor(next_word)
        
    print [i2w[w] for w in sent]

breaking 34 0 tensor(0)
['a', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', 'N', 'years', 'ago', 'researchers', 'reported']


# Batching

In [4]:
training_size = 10

w2i = defaultdict(lambda: len(w2i))
PAD = w2i["<p>"]
S = w2i["<s>"]
UNK = w2i["<unk>"]
MAX_SENTENCE_LENGTH = -1

def read_dataset(filename):
    with open(filename, "r") as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            if i > training_size:
                break
            print i, line.strip() 
            sentence = line.strip().split(" ") + ["<s>"]
            global MAX_SENTENCE_LENGTH
            if len(sentence) > MAX_SENTENCE_LENGTH:
                MAX_SENTENCE_LENGTH = len(sentence)
            yield [w2i[x] for x in sentence]

# Read in the data
train = list(read_dataset("data/ptb/train.txt"))
for t in train:
    while len(t) < MAX_SENTENCE_LENGTH:
        t.append(0)
    print t
            
w2i = defaultdict(lambda: UNK, w2i)

i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

1 pierre <unk> N years old will join the board as a nonexecutive director nov. N
2 mr. <unk> is chairman of <unk> n.v. the dutch publishing group
3 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate
4 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported
5 the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said
6 <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in N
7 although preliminary findings were reported more than a year ago the latest results appear in today 's new england journal of medicine a forum likely to bring new attention to the problem
8 a <unk> <unk> said 

In [5]:
class BatchRNN(torch.nn.Module):
    def __init__(self, vocab_len, batch_size, sentence_length, embed_dim, hidden_dim):
        super(BatchRNN, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.sentence_length = sentence_length
        self.batch_size = batch_size
        
        self.embed = torch.nn.Embedding(vocab_len, embed_dim, padding_idx=0)
        self.rnn = torch.nn.RNN(embed_dim, hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)
        self.hidden = None
        self.initialize()
    def forward(self, sentences):
        embedding = self.embed(sentences.t()).view(self.sentence_length,self.batch_size,self.embed_dim)
        out, self.hidden = self.rnn(embedding, self.hidden)
        return F.log_softmax(self.linear(out), dim = 2)
    
    def initialize(self):
        self.hidden = torch.zeros(1,self.batch_size, self.hidden_dim)
    
rnn = BatchRNN(vocab_len = 3, batch_size = 2, sentence_length = 5, embed_dim = 3, hidden_dim = 2)
rnn(torch.tensor([[1,0,0,1,0], [2,1,0,1,2]]))   

tensor([[[-0.4656, -1.7887, -1.5845],
         [-0.5929, -1.5279, -1.4686]],

        [[-0.5999, -1.4814, -1.4970],
         [-0.4917, -1.7055, -1.5762]],

        [[-0.6117, -1.4467, -1.5039],
         [-0.6115, -1.4541, -1.4969]],

        [[-0.5289, -1.6088, -1.5577],
         [-0.5232, -1.6227, -1.5605]],

        [[-0.6195, -1.4344, -1.4981],
         [-0.6419, -1.4143, -1.4669]]], grad_fn=<LogSoftmaxBackward>)

In [15]:
batch_size = 10

model = BatchRNN(vocab_len = len(w2i), batch_size = batch_size, 
                sentence_length = MAX_SENTENCE_LENGTH, embed_dim = 16, hidden_dim = 16)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

train_short = train[:batch_size]
print len(train_short) #[:500]
for t in range(500):
     
    optimizer.zero_grad()
    model.initialize()

    y_pred = model(torch.tensor(train_short))

    loss = loss_fn(y_pred.view(35, -1, batch_size), torch.tensor(train_short).t())

    loss.backward()
    optimizer.step()
    if t%100 == 99:
        print t, loss.item()/len(train_short)

10
99 0.056821000576
199 0.0209852322936
299 0.0152653649449
399 0.0136441558599
499 0.0163279771805
