# BOW and CBOW Sentiment

Not totally sure which of these are really BOW and CBOW or some mixture between the two...

Also, this is from the [CMU](http://phontron.com/class/nn4nlp2017/index.html) class (if that wasn't apparent from the name)


In [1]:
import torch
import torch.nn.functional as F
from collections import defaultdict
import random
import numpy as np

In [2]:
# Functions to read in the corpus
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

In [3]:
# Read in the data
train = list(read_dataset("data/classes/train.txt"))
random.shuffle(train)
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("data/classes/test.txt"))
nwords = len(w2i)
ntags = len(t2i)

In [5]:
train_sents = open("data/classes/train.txt").readlines()
print (len(train_sents))
train_sents[0]

8544


"3 ||| The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .\n"

In [6]:
class CBOW(torch.nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()       
        self.embeddings = torch.nn.Embedding(len(w2i), len(t2i))
        self.bias = torch.nn.Parameter(torch.zeros(len(t2i)).view(1,-1))

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        return F.log_softmax(embeds.sum(dim=0).view(1,-1) + self.bias, dim=1)
    
class CBOW2(torch.nn.Module):
    def __init__(self, hidden_dim):
        super(CBOW2, self).__init__()       
        self.embeddings = torch.nn.Embedding(len(w2i), hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, len(t2i))

    def forward(self, inputs):
        embeds = self.embeddings(inputs).sum(dim=0).view(1,-1)
        return F.log_softmax(self.linear(embeds), dim=1)
    
class CBOW3(torch.nn.Module):
    def __init__(self, hidden_dim):
        super(CBOW3, self).__init__()       
        self.embeddings = torch.nn.Embedding(len(w2i), hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, len(t2i))
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).sum(dim=0).view(1,-1)
        out = self.dropout(self.linear(embeds))
        return F.log_softmax(out, dim=1)
    
class BOW(torch.nn.Module):
    def __init__(self, embedding_dim):
        super(BOW, self).__init__()       
        self.embeddings = torch.nn.Embedding(len(w2i), len(t2i))

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        return F.log_softmax(embeds, dim=1).sum(dim=0).view(1,-1)

In [14]:
model = CBOW()
model.train()
criterion = torch.nn.CrossEntropyLoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
for t in range(40):
    
    random.shuffle(train)
    train_loss = 0.0
    for words, tag in train:
        optimizer.zero_grad()
        x = torch.tensor([w for w in words])
        y_pred = model(x)

        loss = criterion(y_pred, torch.tensor([tag]))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print (t, train_loss/len(train))

0 3.709558172132933
1 2.465143023814163
2 1.834459099004429
3 1.435342233773243
4 1.144472440846312
5 0.9405416068535692
6 0.7848543223704214
7 0.6594357753147315
8 0.5684988972053117
9 0.4946089000837584
10 0.4470698067193624
11 0.3983736683385701
12 0.3577868537908604
13 0.32941330506672245
14 0.29942043866076046
15 0.27762210196943515
16 0.2591409451355754
17 0.24231784828653072
18 0.2278954951667317
19 0.2145715494312648
20 0.20294356728154692
21 0.19202283009782165
22 0.18433360276220828
23 0.1751796147665375
24 0.16770824875247936
25 0.16079857511569265
26 0.15473037727163283
27 0.1480980140488112
28 0.1435162738673898
29 0.13818768860316188
30 0.13440050533678682
31 0.1294369723726822
32 0.12572743762643343
33 0.1221397802162686
34 0.11838425910560799
35 0.11487677544010727
36 0.11179136968183917
37 0.10863384081444626
38 0.10631993477358817
39 0.10367241447304806


In [15]:
model.eval()
with torch.no_grad():
    correct = 0
    for words, tag in dev:
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        correct += torch.argmax(y_pred, dim = 1).item() == tag

    print ("dev", float(correct)/len(dev))
    correct = 0
    for words, tag in train:
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        correct += torch.argmax(y_pred, dim = 1).item() == tag

    print ("Train", float(correct)/len(train))  

dev 0.36787330316742084
Train 0.9921582397003745


In [29]:
# model = CBOW3(50)
# model.train()
# criterion = torch.nn.CrossEntropyLoss(reduction="sum")
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
# for t in range(1):
    
#     random.shuffle(train)
#     train_loss = 0.0
#     for words, tag in train:
#         optimizer.zero_grad()
#         x = torch.tensor([w for w in words])
#         y_pred = model(x)

#         loss = criterion(y_pred, torch.tensor([tag]))
#         train_loss += loss.item()
#         loss.backward()
#         optimizer.step()

#     print (t, train_loss/len(train))

0 0.98368786606


In [30]:
# model.eval()
# with torch.no_grad():
#     correct = 0
#     for words, tag in dev:
#         x = torch.tensor([w for w in words])
#         y_pred = model(x)
#         correct += torch.argmax(y_pred, dim = 1).item() == tag

#     print ("dev", float(correct)/len(dev))
#     correct = 0
#     for words, tag in train:
#         x = torch.tensor([w for w in words])
#         y_pred = model(x)
#         correct += torch.argmax(y_pred, dim = 1).item() == tag

#     print ("Train", float(correct)/len(train)  )

dev 0.354751131222
Train 0.949672284644


In [9]:
class Sentiment(torch.nn.Module):
    def __init__(self):
        super(Sentiment, self).__init__()       
        self.embeddings = torch.nn.Embedding(len(w2i), 1)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        # print "embeds", embeds.sum(dim=0)
        return torch.sigmoid(embeds.sum(dim=0))

In [10]:
model = Sentiment()
criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-0)

for t in range(40):
    
    random.shuffle(train)
    train_loss = 0.0
    for words, tag in train:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        # print y_pred
        loss = criterion(y_pred, torch.tensor([tag]))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print t, train_loss/len(train)

0 2.41802108698
1 1.33709841283
2 0.835866116547
3 0.586966595206
4 0.435553118778
5 0.33201093033
6 0.250492178666
7 0.199820565287
8 0.149695967955
9 0.118060995762
10 0.0773955546789
11 0.0685972794444
12 0.0529615695594
13 0.0337114673106
14 0.0321699700949
15 0.0310849630382
16 0.0286231650666
17 0.0267585572528
18 0.0245588197573
19 0.0224945617789
20 0.0222473219775
21 0.0217625251784
22 0.0217446602817
23 0.0215785687158
24 0.0214353888665
25 0.021317885629
26 0.0213136909239
27 0.0212321412386
28 0.0212236477079
29 0.0211467029458
30 0.0210595684644
31 0.02102685447
32 0.020967209591
33 0.0209652713155
34 0.0209372561681
35 0.0208828903269
36 0.0208587476873
37 0.0208273177294
38 0.0208158793943
39 0.0207795743961


In [11]:
model.eval()
with torch.no_grad():
    correct = 0
    using = 0
    for words, tag in dev:
        if tag == 2:
            continue
        using += 1
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        correct += abs(tag - y_pred.item()) < 0.5

    print "dev", float(correct)/using
    correct = 0
    using = 0
    for words, tag in train:
        if tag == 2:
            continue
        using += 1
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        # print y_pred
        correct += abs(tag - y_pred.item()) < 0.5

    print "Train", float(correct)/using  

dev 0.783635365184
Train 0.999132947977


# CNNs

In [176]:
class CNN(torch.nn.Module):
    def __init__(self, vocab_size, window_size, embedding_dim, output_channels):
        super(CNN, self).__init__()
        
        assert window_size%2 == 1 #odd window sizes please
        
        self.embedding_dim = embedding_dim
        self.output_channels = output_channels
        
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.cnn = torch.nn.Conv1d(embedding_dim, output_channels, window_size, padding = (window_size - 1)/2) 
        self.linear = torch.nn.Linear(output_channels, 1)
        
    def forward(self, inputs):
        embedded = self.embed(inputs).t().view(1,self.embedding_dim, -1)
        convoluted = self.cnn(embedded)
        pooled = F.relu(F.max_pool1d(convoluted, len(inputs)))
        return torch.sigmoid(self.linear(pooled.view(self.output_channels)))
    
cnn = CNN(5,3,32,16)
input = torch.tensor([0,1,2,1,4]) 
# print input
cnn(input)

tensor([0.4653], grad_fn=<SigmoidBackward>)

In [186]:
model = CNN(len(w2i),3,16,16)

criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum = 0.1)

for t in range(40):
    
    random.shuffle(train)
    train_loss = 0.0
    for words, tag in train:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        # print y_pred
        loss = criterion(y_pred, torch.tensor([tag]))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print t, train_loss/len(train)

0 0.559116127963
1 0.546992398338
2 0.537920873117
3 0.529935765009
4 0.522986482761
5 0.514049974196
6 0.506678677761
7 0.496691504355
8 0.484541071418
9 0.478142225802
10 0.47074982075
11 0.458209036435
12 0.449260221628
13 0.438598727762
14 0.427408823783
15 0.418008087933
16 0.409858000395
17 0.396625886976
18 0.38380466141
19 0.371819286575
20 0.357757684014
21 0.34230660512
22 0.333373975069
23 0.312854016985
24 0.296530620671
25 0.282677547839
26 0.266663719459
27 0.242970353538
28 0.228311647407
29 0.211016473039
30 0.195309464113
31 0.1568078539
32 0.141335186566
33 0.12904788398
34 0.107520385369
35 0.0945435691704
36 0.0705132258136
37 0.0588438163109
38 0.0405577773309
39 0.026601341458


In [188]:
with torch.no_grad():
    correct = 0
    using = 0
    for words, tag in dev:
        if tag == 2:
            continue
        using += 1
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        correct += abs(tag - y_pred.item()) < 0.5

    print "dev", float(correct)/using
    correct = 0
    using = 0
    for words, tag in train:
        if tag == 2:
            continue
        using += 1
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        x = torch.tensor([w for w in words])
        y_pred = model(x)
        # print y_pred
        correct += abs(tag - y_pred.item()) < 0.5

    print "Train", float(correct)/using  

dev 0.668863261944
Train 0.996242774566


# RNNs

In [16]:
class SimpleRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SimpleRNN, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim) 
        self.linear = torch.nn.Linear(hidden_dim, 1)
        self.hidden = None
        
    def forward(self, inputs):
        embedded = self.embed(inputs).view(len(inputs),1,-1)
        #print embedded
        out, self.hidden = self.rnn(embedded, self.hidden)
        return torch.sigmoid(self.linear(out)[-1].view(1,1))
    
    def initialize(self):
        self.hidden = torch.zeros(1,1, self.hidden_dim)
        
rnn = SimpleRNN(3,5,2)
rnn(torch.tensor([1,0,1,0]))

tensor([[0.5408]], grad_fn=<SigmoidBackward>)

In [17]:
model = SimpleRNN(len(w2i),16,16)


criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum = 0.9)


train = train #[:500]
for t in range(40):
    
    random.shuffle(train)
    train_loss = 0.0
    for words, tag in train:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        model.initialize()
        for w in words:
            x = torch.tensor([w])
            y_pred = model(x)

        loss = criterion(y_pred, torch.tensor([tag]).view(1,1))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print (t, train_loss/len(train))

KeyboardInterrupt: 

In [22]:
model = SimpleRNN(len(w2i),16,16)


criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum = 0.9)


train = train[:500]
for t in range(40):
    
    random.shuffle(train)
    train_loss = 0.0
    for words, tag in train:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        model.initialize()

        x = torch.tensor([w for w in words])
        y_pred = model(x)

        loss = criterion(y_pred, torch.tensor([tag]).view(1,1))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print (t, train_loss/len(train))

0 0.5692605257034302
1 0.5648662068843842
2 0.5603104606866837
3 0.5570976375937462
4 0.5519396099448204
5 0.5455788746476173
6 0.540050355732441
7 0.5390247922837734
8 0.5301990541517735
9 0.5262349545657635
10 0.5202215259671211
11 0.5126709409654141
12 0.5063853273987771
13 0.5022973581552506
14 0.493425964474678
15 0.4844797843694687
16 0.4785664509385824
17 0.4577317737787962
18 0.4482614317238331
19 0.43179953045397995
20 0.40643735373392703
21 0.3951086123175919
22 0.36012182890996336
23 0.3329659046009183
24 0.30500614741444587
25 0.2707595526473597
26 0.24816703260596842
27 0.21749827189650386
28 0.2169035478234291
29 0.1641898257934954
30 0.13485574983432888
31 0.11247693310817704
32 0.09395424605731387
33 0.1744057827568613
34 0.19947989458573284
35 0.10215563367280993
36 0.054995651553676
37 0.03323686621818342
38 0.022800288593192816
39 0.016619373231915233


In [72]:
class BiRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(BiRNN, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, bidirectional=True) 
        self.linear = torch.nn.Linear(2*hidden_dim, 1)
        self.hidden = None
        
    def forward(self, inputs):
        embedded = self.embed(inputs).view(len(inputs),1,-1)
        out, self.hidden = self.rnn(embedded, self.hidden)
        out = torch.cat((out[-1][0][0:self.hidden_dim], out[0][0][self.hidden_dim:])).view(1,-1)
        return torch.sigmoid(self.linear(out))
    
    def initialize(self):
        self.hidden = torch.zeros(2,1, self.hidden_dim)
        
rnn = BiRNN(3,1,2)
rnn(torch.tensor([1,0,1,0,1,0,1]))

tensor([[0.5921]], grad_fn=<SigmoidBackward>)

In [84]:
model = BiRNN(len(w2i),16,16)


criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum = 0.1)


train_short = train[:500]
for t in range(40):
    
    random.shuffle(train_short)
    train_loss = 0.0
    for words, tag in train_short:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        model.initialize()

        x = torch.tensor([w for w in words])
        y_pred = model(x)

        loss = criterion(y_pred, torch.tensor([tag]))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print t, train_loss/len(train)

0 0.0333554408886
1 0.0330993935801
2 0.0329459922123
3 0.0328314971276
4 0.0327541858118
5 0.032677121
6 0.0326076899812
7 0.0325467582576
8 0.0324889212247
9 0.0324354128535
10 0.0323851123724
11 0.0323335546093
12 0.0322865425131
13 0.0322400705701
14 0.0321906176657
15 0.032143055859
16 0.0321025188684
17 0.0320543658567
18 0.032011705067
19 0.0319664685146
20 0.03191965997
21 0.0318717082286
22 0.0318258629121
23 0.0317757621038
24 0.0317296407112
25 0.0316744882523
26 0.0316320694146
27 0.031581087596
28 0.0315288390061
29 0.0314772245213
30 0.0314072049886
31 0.0313700506976
32 0.0313115433636
33 0.0312559163836
34 0.0311904747339
35 0.0311333001016
36 0.0310749847543
37 0.0310104609945
38 0.0309397878303
39 0.0308863904732


In [79]:
class BiLSTM(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(BiLSTM, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True) 
        self.linear = torch.nn.Linear(2*hidden_dim, 1)
        self.hidden = None
        
    def forward(self, inputs):
        embedded = self.embed(inputs).view(len(inputs),1,-1)
        out, self.hidden = self.rnn(embedded, self.hidden)
        out = torch.cat((out[-1][0][0:self.hidden_dim], out[0][0][self.hidden_dim:])).view(1,-1)
        return torch.sigmoid(self.linear(out))
    
    def initialize(self):
        self.hidden = (torch.zeros(2,1, self.hidden_dim), torch.zeros(2,1, self.hidden_dim))
        
rnn = BiLSTM(3,1,2)
rnn(torch.tensor([1,0,1,0,1,0,1]))

tensor([[0.4904]], grad_fn=<SigmoidBackward>)

In [83]:
model = BiLSTM(len(w2i),16,16)


criterion = torch.nn.BCELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum = 0.9)


train_short = train #[:500]
for t in range(40):
    
    random.shuffle(train_short)
    train_loss = 0.0
    for words, tag in train_short:
        if tag == 2:
            continue
        if tag < 2:
            tag = 0.0
        if tag > 2:
            tag = 1.0
        optimizer.zero_grad()
        model.initialize()

        x = torch.tensor([w for w in words])
        y_pred = model(x)

        loss = criterion(y_pred, torch.tensor([tag]))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print t, train_loss/len(train)

0 0.564201382371
1 0.550223903024
2 0.526209019758
3 0.498078458186
4 0.460739228106
5 0.416743377978
6 0.367283961047
7 0.312497644719
8 0.257001315494
9 0.206403194881
10 0.164677421874
11 0.118634913678
12 0.105030022414
13 0.0839344270636
14 0.0520474091161
15 0.0322553226341
16 0.0174046786955
17 0.0456262493326
18 0.0777544113805
19 0.0908013209474
20 0.0625696961775
21 0.0277768347734
22 0.0122808843571
23 0.00462367299147
24 0.00157571853174
25 0.000983569798029
26 0.000766159769663
27 0.000631276969493
28 0.000539195835022
29 0.000471172976379
30 0.000418577219913
31 0.000376040047462
32 0.000341003955958
33 0.000311871213067
34 0.000287183237383
35 0.000266152927249
36 0.000247947381376
37 0.000231780313911
38 0.000217635858367
39 0.00020525988874
