In [138]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from copy import deepcopy
from collections import Counter
import os
import math
random.seed(1024)

In [3]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

# Data Preprocessing

In [4]:
with open('corpus/corpus.txt', 'r', encoding="utf8") as f:
    data = f.read()

In [259]:
word_vocab = Counter()
char_vocab = Counter()
char_vocab.update(['{', '}'])
text_location = os.path.join(os.getcwd(), 'corpus/')
filenames = os.listdir(text_location)
counter = 0
for filename in filenames:
    filename = os.path.join(text_location, filename)
    counter += 1
    with open(filename, 'r', encoding='utf8') as f:
        line = f.read()
        word_vocab.update(line.lower().split())
        char_vocab.update(line)
    if counter % 100 == 0:
        print(counter)
print(word_vocab.most_common(5))

[('de', 35660), ('la', 11039), ('en', 9767), ('el', 9246), ('y', 8823)]


In [260]:
char_to_index = {e:n+1 for n, e in enumerate(char_vocab)}
index_to_char = {n+1:e for n, e in enumerate(char_vocab)}

In [261]:
WINDOW = 3
num_total_words = sum([num for word, num in word_vocab.items()])
unigram_table = []
Z = 0.001
for word in word_vocab:
    unigram_table.extend([word] * int(((word_vocab[word]/num_total_words)**0.75)/Z))

In [262]:
def get_negative(word):
    neg = random.choice(unigram_table)
    while neg == word.lower():
        neg = random.choice(unigram_table)
    return neg

def prepare_files(filenames):
    MIN_COUNT = 2
    for filename in filenames:
        with open(filename, 'r', encoding='utf8') as f:
            for line in f:
                words = line.split()
                max_j = len(words)
                for i, word in enumerate(words):
                    if word_vocab[word.lower()] <= MIN_COUNT:
                        continue
                    frequency = word_vocab[word.lower()] / num_total_words
                    number = 1 - math.sqrt(0.00005/frequency)
                    if random.uniform(0, 1) <= number:
                        continue
                    for j in range(i - WINDOW, i + WINDOW):
                        if (i == j) or (j < 0) or (j >= max_j):
                            continue
                        target = words[j]
                        negati = get_negative(word)
                        yield (word, target, negati)

def prepare_word(word, char_to_index):
    start = [char_to_index['{']]
    finish = [char_to_index['}']]
    return start + [char_to_index[char] for char in word] + finish

In [263]:
BUFFER_SIZE = 10000
def get_buffer(filenames, buffer_size):
    random.shuffle(filenames)
    buffer = []
    for word, target, negati in prepare_files(filenames):
        word = prepare_word(word, char_to_index)
        target = prepare_word(target, char_to_index)
        negati = prepare_word(negati, char_to_index)
        buffer.append([word, target, 1])
        buffer.append([word, negati, 0])
        if len(buffer) == buffer_size:
            yield buffer
            buffer = []
    yield buffer
    
def get_batch(filenames, buffer_size, batch_size):
    for buffer in get_buffer(filenames, buffer_size):
        random.shuffle(buffer)
        sindex = 0
        eindex = batch_size
        while eindex < len(buffer):
            batch = buffer[sindex:eindex]
            temp = eindex
            eindex = eindex + batch_size
            sindex = temp
            yield batch
        if eindex >= len(buffer):
            batch = buffer[sindex:]
            yield batch
            
def pad_to_batch(batch):
    sources, targets, y = zip(*batch)
    max_sources = max([len(w) for w in sources])
    max_targets = max([len(w) for w in targets])
    max_length = max([max_sources, max_targets])
    x_p, y_p = [], []
    for i in range(len(batch)):
        source = sources[i]
        target = targets[i]
        source = source + [0] * (max_length - len(source))
        target = target + [0] * (max_length - len(target))
        x_p.append(Variable(LongTensor(source + target)).view(1, -1))
        y_p.append(Variable(LongTensor([y[i]])))
    return torch.cat(x_p), torch.cat(y_p).view(-1)

In [264]:
text_location = os.path.join(os.getcwd(), 'corpus/')
filenames = [os.path.join(text_location, filename) for filename in os.listdir(text_location)]
batches = get_batch(filenames, BUFFER_SIZE, 100)
hey=pad_to_batch(next(batches))

In [265]:
hey[1].size()

torch.Size([100])

# Model

In [282]:
# embedding dimension 15
# filter widths = [1, 2, 3, 4, 5, 6, 7]
# filter dimens = [50, 100, 150, 200, 200, 200, 200]
# tanh
# highway network num 2
# relu activation

class Word2CNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, kernel_dims, kernel_sizes, 
                 dropout=0.5, highway_layers=2):
        super(Word2CNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, dim, (size, embedding_dim)) for dim, size in zip(kernel_dims, kernel_sizes)])
        self.dropout = nn.Dropout(dropout)
        internal_dim = sum(kernel_dims)
        self.hw_num_layers = highway_layers
        self.hw_nonlinear = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        self.hw_linear = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        self.hw_gate = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        self.final_layer = nn.Linear(internal_dim * 2, 2)
        
    def forward(self, inputs, is_training=False):
        inputs = inputs.view(inputs.size()[0]*2, -1) # each word on a line [B, MAX_LENGTH]
        print(inputs)
        inputs = self.embeddings(inputs).unsqueeze(1) # [BATCH, 1, MAX_LENGTH, EM_SIZE]
        inputs = [F.tanh(conv(inputs)).squeeze(3) for conv in self.convs] # [BATCH, K_DIM, MAX_LENGTH]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] # [BATCH, K_DIM]*len(Ks)
        inputs = torch.cat(inputs, 1) # [BATCH, K_DIM*len(Ks)]
        if is_training:
            inputs = self.dropout(inputs)
        for layer in range(self.hw_num_layers):
            gate = F.sigmoid(self.hw_gate[layer](inputs))
            nonlinear = F.relu(self.hw_nonlinear[layer](inputs))
            linear = self.hw_linear[layer](inputs)
            inputs = gate * nonlinear + (1 - gate) * linear
        if is_training:
            inputs = self.dropout(inputs)
        inputs = inputs.view(-1, inputs.size()[1]*2)
        out = self.final_layer(inputs)
        print(out)
        return F.log_softmax(out, 1)

In [283]:
EPOCH = 5
BATCH_SIZE = 50
EMBEDDING_SIZE = 15
KERNEL_SIZES = [1, 2, 3, 4, 5, 6, 7]
KERNEL_DIMEN = [50, 100, 150, 200, 200, 200, 200]
LR = 0.001
vocab_size = len(char_to_index) + 1

In [284]:
model = Word2CNN(vocab_size, EMBEDDING_SIZE, KERNEL_DIMEN, KERNEL_SIZES)
if USE_CUDA:
    model = model.cuda()
    
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [285]:
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(get_batch(filenames, BUFFER_SIZE, BATCH_SIZE)):
        inputs, targets = pad_to_batch(batch)
        model.zero_grad()
        preds = model(inputs, True)
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist()[0])
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

Variable containing:
    1    25    17  ...      0     0     0
    1    31     9  ...      0     0     0
    1    30    12  ...      0     0     0
       ...          ⋱          ...       
    1    27     9  ...      0     0     0
    1    67    40  ...      0     0     0
    1    50    17  ...      0     0     0
[torch.cuda.LongTensor of size 100x17 (GPU 0)]

Variable containing:
-0.1078  0.0481
-0.2579  0.0491
-0.0554 -0.1255
 0.0548 -0.0363
 0.1613  0.0865
 0.2053 -0.0765
-0.0818 -0.1335
 0.1077 -0.0264
 0.0670  0.0087
-0.0139 -0.0257
-0.0641 -0.0886
 0.0582  0.1069
-0.0683 -0.0539
 0.0640  0.0799
-0.0279  0.0294
-0.0238 -0.0278
-0.0283 -0.0233
-0.0470  0.0798
 0.0199 -0.0972
-0.0290  0.0180
 0.1005  0.1255
 0.0288  0.0272
 0.0134  0.0008
-0.0168 -0.0003
-0.0360  0.0400
 0.0767  0.2052
-0.0313  0.0906
 0.0013 -0.1076
-0.0438  0.2225
 0.0575  0.0938
 0.0792  0.1072
 0.0673 -0.1498
 0.0162 -0.0218
-0.1489 -0.0034
 0.0458  0.0178
 0.0461 -0.2016
 0.0701 -0.1195
 0.0622  0.1631
 0.0148 

Variable containing:
    1    69     2  ...      0     0     0
    1     9     7  ...      0     0     0
    1    64    12  ...      0     0     0
       ...          ⋱          ...       
    1    30    12  ...      0     0     0
    1    54    17  ...      0     0     0
    1    20    22  ...      0     0     0
[torch.cuda.LongTensor of size 100x17 (GPU 0)]

Variable containing:
-0.1722  0.3678
 0.1864  0.0001
 0.2299 -0.2985
 0.1777 -0.3191
 0.4322 -0.1024
 0.1575 -0.1470
 0.1519  0.4017
 0.2276 -0.2904
-0.1661 -0.2846
-0.3359  0.2363
-0.0244  0.1708
 0.1412 -0.0923
-0.1164  0.0339
 0.1390 -0.2209
 0.2244  0.0824
 0.0552  0.0242
 0.2038 -0.3323
 0.1909 -0.2269
 0.1420 -0.3295
-0.0436 -0.0170
-0.2941  0.1735
-0.1790  0.0668
 0.0371 -0.3228
-0.0603  0.2204
 0.1160 -0.2929
 0.3514 -0.1255
 0.3536 -0.5763
 0.0019  0.1553
 0.3479 -0.1943
-0.1844  0.1329
 0.0799 -0.2102
-0.2217 -0.0991
-0.4369  0.3264
 0.0192 -0.0602
 0.1461 -0.0986
-0.0015  0.0750
-0.3792 -0.1343
 0.2374 -0.1843
 0.0774 

Variable containing:
 0.2264 -0.1020
 0.1087  0.1560
 0.1531 -0.3440
-0.0751  0.1296
 0.2951 -0.0882
-0.1166  0.0876
-0.0943 -0.2790
-0.2794  0.3314
 0.6901 -0.3973
 0.3068 -0.1883
-0.4844  0.4040
 0.1846 -0.3335
 0.7391 -0.7061
 0.4356 -0.0852
-0.1305 -0.2672
 0.2304 -0.3494
 0.1153 -0.1589
 0.3414 -0.3493
 0.1466 -0.2938
-0.1228  0.0985
 0.0393  0.0020
 0.3141 -0.4036
 0.1152 -0.1906
-0.1909 -0.0154
 0.2585 -0.2224
-0.0110 -0.2810
-0.9865  0.9899
 0.3361 -0.0623
 0.1672  0.0835
 0.1776 -0.2430
 0.3681 -0.4036
 0.1505  0.1681
 0.3354 -0.3067
 0.7810 -0.5872
 0.5020 -0.4568
 0.2045 -0.3539
 0.0657 -0.1850
 0.0321 -0.3243
-0.2873  0.3326
-0.1682  0.2374
-0.2254  0.1707
-0.1542  0.1626
 0.0585 -0.0402
 0.1934 -0.6640
 0.3761 -0.2453
 0.1295 -0.3164
-0.1046 -0.0366
 0.5307 -0.6008
 0.1582 -0.3965
-0.1824  0.3655
[torch.cuda.FloatTensor of size 50x2 (GPU 0)]

Variable containing:
    1    30    12  ...      0     0     0
    1    30    12  ...      0     0     0
    1    40    38  ...     

Variable containing:
-0.2549  0.3467
 0.0810 -0.1714
 0.3622  0.1061
 0.0420 -0.1877
 0.2462 -0.3060
 0.2532 -0.3038
 0.0233 -0.3324
-0.1255 -0.2108
 0.2218  0.1662
-0.3806  0.0697
-0.0757 -0.2226
-0.0554  0.1240
 0.2995 -0.5284
 0.3806 -0.2085
-0.1967  0.3153
-0.3318  0.1817
-0.1017  0.3897
-0.2967  0.2999
-0.0163  0.0188
 0.1652 -0.4312
 0.1030 -0.2887
 0.0170 -0.0981
-0.3079 -0.0151
-0.1409  0.3523
-0.2624 -0.1266
 0.7873 -1.2846
-0.2726  0.3364
 0.0140 -0.5259
 0.1936  0.0972
 0.1867 -0.1289
 0.2936 -0.4485
 0.0826  0.0875
 0.1069 -0.1493
 0.5570 -0.7197
-0.0122 -0.0445
-0.5210  0.4403
-0.1608 -0.1248
 0.0136 -0.3142
-0.2126 -0.2440
 0.4415 -0.0798
-0.1709  0.2001
 0.3112  0.1637
-0.1985 -0.0021
-0.0441 -0.1307
 0.6216 -0.4642
-0.1438 -0.1991
 0.3032 -0.3643
-0.1248  0.0390
-0.1734  0.0363
-0.1509  0.3812
[torch.cuda.FloatTensor of size 50x2 (GPU 0)]

Variable containing:
    1    42    22  ...      0     0     0
    1    30    34  ...      0     0     0
    1    30    34  ...     

Variable containing:
    1    39    47  ...      0     0     0
    1    31     6  ...      0     0     0
    1    30    51  ...      0     0     0
       ...          ⋱          ...       
    1     9     7  ...      0     0     0
    1    24    23  ...      0     0     0
    1    69     2  ...      0     0     0
[torch.cuda.LongTensor of size 100x16 (GPU 0)]

Variable containing:
 0.4004 -0.4199
 0.6604 -0.7109
 0.2379 -0.0277
-0.5304  0.6253
 0.5203 -0.2263
 0.0780  0.1405
-0.4165  0.3230
-0.0495  0.1336
 0.0917  0.3610
 0.3112 -0.2034
 0.7800 -0.7686
 0.9063 -0.4247
 0.7116 -0.4650
 0.2383 -0.1333
 0.6110 -0.1636
 0.4327 -0.2777
-0.0171 -0.3058
-1.2877  1.0203
-0.1370 -0.0600
-0.1150 -0.0171
 0.5961 -0.6246
 0.4849 -0.3397
 0.4423 -0.1272
 0.3911 -0.2344
 0.3531 -0.2353
-0.0058 -0.0303
 0.1789 -0.3564
 0.6434 -0.1685
 0.2847 -0.0936
-0.6620  0.8750
-0.0429  0.1107
-0.1502  0.0069
 0.6869 -0.5333
-1.2052  1.4013
-0.2708 -0.0499
 0.2288 -0.3624
 0.5259 -0.3198
-0.0448  0.4022
 0.1647 

Variable containing:
    1    58    33  ...      0     0     0
    1    25    17  ...      0     0     0
    1    58    38  ...      0     0     0
       ...          ⋱          ...       
    1    31    16  ...      0     0     0
    1    33    12  ...      0     0     0
    1    25    17  ...      0     0     0
[torch.cuda.LongTensor of size 100x17 (GPU 0)]

Variable containing:
 6.3837e-02 -1.8317e-01
 2.8375e-01 -4.9649e-01
-3.5139e-01  3.3012e-01
-4.6107e-01  3.9992e-01
 1.8910e-01 -2.9344e-01
-8.4010e-01  5.4291e-01
 4.8514e-01 -5.2628e-01
-6.2115e-05  6.3925e-03
-1.1338e-02 -1.6008e-01
-4.2512e-01  1.9836e-01
-5.3324e-01  2.8679e-01
 3.1569e-02  1.0280e-02
 3.4547e-01 -6.1524e-01
 4.7303e-01 -6.1701e-01
 6.5105e-01 -6.3789e-01
-6.6967e-01  3.1256e-01
 1.8889e-01 -2.0818e-01
 2.2237e-02 -3.0729e-01
-4.3646e-01  2.0168e-01
-1.1006e-01 -1.2581e-01
 4.1050e-01 -6.8426e-01
 2.1574e-01 -7.5304e-02
 1.9712e-01 -1.6778e-01
 8.1949e-02 -1.7829e-01
-1.5502e+00  1.2643e+00
-2.7951e-01  1.0

Variable containing:
    1    42    23  ...      0     0     0
    1    17     7  ...      0     0     0
    1    54    17  ...      0     0     0
       ...          ⋱          ...       
    1    25    17  ...      0     0     0
    1    35    12  ...      0     0     0
    1    20     6  ...      0     0     0
[torch.cuda.LongTensor of size 100x19 (GPU 0)]

Variable containing:
 0.5031 -0.4795
-0.2877  0.1716
 0.1395 -0.2878
-0.0756 -0.0957
-0.1840  0.1651
 0.4858 -0.4236
 0.4152 -0.4201
-0.1704 -0.0260
 0.2294 -0.3894
 0.5848 -0.6932
-0.1479  0.1009
 0.3095 -0.2137
 0.7082 -0.7825
 0.7359 -0.6032
-0.1805  0.0863
-0.3001  0.2360
 0.5262 -0.5906
-0.3548  0.5601
 0.2348 -0.2990
 0.6994 -0.6360
 0.6643 -0.6530
 0.0185  0.0762
 0.4451 -0.6172
 0.2880 -0.2353
-0.3683  0.1940
-0.1181  0.1371
 0.1393 -0.1025
-0.0703 -0.0749
 0.0963 -0.3780
-0.0803  0.0911
 0.9498 -0.9475
 0.3696 -0.3130
 0.1031 -0.0284
 0.0651 -0.0944
 0.1210 -0.1419
 0.1730 -0.4310
-0.2557  0.1592
-0.1084 -0.0468
 0.8944 

Variable containing:
    1    33    12  ...      0     0     0
    1    27    22  ...      0     0     0
    1    39    17  ...      0     0     0
       ...          ⋱          ...       
    1    33    12  ...      0     0     0
    1    58    62  ...      2     0     0
    1    35    12  ...      0     0     0
[torch.cuda.LongTensor of size 100x15 (GPU 0)]

Variable containing:
 0.3324 -0.3646
-0.4223  0.3592
 0.4606 -0.5547
 0.4500 -0.4798
 0.7607 -0.8363
 0.8475 -0.9082
-0.0710  0.0639
 0.0446 -0.0232
 0.7862 -0.9298
 0.0514 -0.1375
 0.0804 -0.0032
 0.6298 -0.6194
 0.6651 -0.6412
 0.2779 -0.4098
 0.3512 -0.4785
 0.3398 -0.4052
-0.3884  0.4528
 0.5998 -0.4794
 0.2934 -0.5176
-0.0082 -0.0668
 0.3305 -0.6448
 0.6087 -0.7019
 0.3365 -0.3788
 0.7154 -0.7350
-0.1794  0.0302
 1.0687 -1.0807
 0.6949 -0.8158
 0.5116 -0.6016
 0.1070 -0.1486
 0.3710 -0.3913
-0.2298  0.1293
-0.6302  0.6893
-0.0981  0.0369
 0.3426 -0.3385
 0.5115 -0.4744
 0.1224 -0.2408
-0.6230  0.5340
 0.2724 -0.2042
-0.4912 

Variable containing:
    1    51    12  ...      0     0     0
    1    42     6  ...      0     0     0
    1    42    23  ...      0     0     0
       ...          ⋱          ...       
    1    24    22  ...      0     0     0
    1    23     9  ...      0     0     0
    1    17    19  ...      0     0     0
[torch.cuda.LongTensor of size 100x16 (GPU 0)]

Variable containing:
-0.1229 -0.1040
 0.1592 -0.1509
 0.4927 -0.4391
-0.0860  0.1074
-0.4114  0.4125
 0.3841 -0.3911
-0.8771  0.6762
 0.4557 -0.4083
 0.7796 -1.2257
-0.6751  0.3928
 0.1257 -0.0619
 0.2329 -0.1172
 0.7036 -0.7420
 0.6235 -0.7913
 0.7458 -0.7295
-1.0545  1.0773
-0.1457  0.2039
-1.6986  1.6714
 0.2380 -0.3089
 0.6104 -0.6504
-0.0511 -0.0480
-0.0699 -0.0380
 0.3456 -0.2107
-0.4782  0.4558
 0.1014 -0.1532
-0.2489 -0.0542
-0.3795  0.3609
 1.1758 -1.0544
 0.1189 -0.1585
 0.9459 -0.9722
-0.2366  0.4032
-0.1415  0.0293
 0.8228 -0.8485
-0.4201  0.2702
-0.7243  0.7243
 0.3114 -0.4158
-0.5137  0.4913
 0.1104 -0.2063
-0.0597 

KeyboardInterrupt: 

In [258]:
[1] + [2,3] + [4]

[1, 2, 3, 4]

In [271]:
Variable(torch.LongTensor(3).random_(5))

Variable containing:
 3
 1
 1
[torch.LongTensor of size 3]

In [272]:
Variable(torch.randn(3, 5), requires_grad=True)

Variable containing:
-1.5904  0.6392 -0.7506 -0.7602 -0.6794
-0.1573 -1.4171 -0.1542 -0.3880  1.0029
 0.6868  1.8002 -0.3484  1.5169  0.7620
[torch.FloatTensor of size 3x5]