In [5]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
import os
import math
# Seed every RNG the notebook touches: `random` drives sampling/shuffling,
# torch drives weight initialisation and dropout, numpy is seeded for safety.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

In [3]:
# Detect CUDA once; everything below falls back to CPU tensors when it is absent.
USE_CUDA = torch.cuda.is_available()
gpus = [0]
# Selecting a device is only valid when CUDA is present; calling it
# unconditionally raises on CPU-only machines.
if USE_CUDA:
    torch.cuda.set_device(gpus[0])

# Tensor constructors bound to the active device type.
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

# Data Preprocessing

In [7]:
# Count lower-cased words and raw characters over every file in ./corpus.
word_vocab = Counter()
char_vocab = Counter()
# '{' and '}' are the start/end-of-word markers that prepare_word adds.
char_vocab.update(['{', '}'])
text_location = os.path.join(os.getcwd(), 'corpus/')
# os.listdir returns entries in arbitrary order; sorting keeps the Counter
# insertion order (and therefore the character indices derived from it)
# identical across machines and runs.
filenames = sorted(os.listdir(text_location))
for filename in filenames:
    path = os.path.join(text_location, filename)
    if not os.path.isfile(path):
        # Skip sub-directories (e.g. checkpoint folders) instead of crashing in open().
        continue
    with open(path, 'r', encoding='utf8') as f:
        text = f.read()
    lowered = text.lower()
    word_vocab.update(lowered.split())
    char_vocab.update(text)
    # Negative samples are drawn from the lower-cased word vocabulary, so make
    # sure every lower-cased character is indexable even when it only occurs
    # upper-cased in the corpus. Iterating the text (not a set) keeps the
    # insertion order deterministic.
    char_vocab.update(ch for ch in lowered if ch not in char_vocab)
print(word_vocab.most_common(5))

[('de', 35660), ('la', 11039), ('en', 9767), ('el', 9246), ('y', 8823)]


In [8]:
# Index 0 is reserved for the PAD symbol, so real characters are numbered from 1.
char_to_index = {char: idx for idx, char in enumerate(char_vocab, start=1)}
# Inverse lookup: index -> character.
index_to_char = {idx: char for char, idx in char_to_index.items()}

In [9]:
# Context window radius used when generating skip-gram pairs.
WINDOW = 3
num_total_words = sum(word_vocab.values())
# Sampling table for negatives: each word is repeated in proportion to its
# relative frequency raised to the 0.75 power, scaled by 1/Z and truncated
# to an integer (words whose share rounds down to zero are left out).
unigram_table = []
Z = 0.001
for word, count in word_vocab.items():
    copies = int(((count / num_total_words) ** 0.75) / Z)
    unigram_table.extend([word] * copies)

In [10]:
def get_negative(word):
    """Draw a negative sample from ``unigram_table``.

    Candidates are redrawn until one differs from the lower-cased ``word``.

    Args:
        word: the centre word the negative must differ from (any casing).

    Returns:
        An entry of ``unigram_table`` that is not ``word.lower()``.
    """
    while True:
        candidate = random.choice(unigram_table)
        if candidate != word.lower():
            return candidate

def prepare_files(filenames):
    """Yield ``(word, target, negative)`` skip-gram triples from text files.

    Every file is read line by line and split on whitespace. A token is used
    as a centre word only if its lower-cased count in ``word_vocab`` exceeds
    ``MIN_COUNT`` and it survives frequency-based subsampling. For each
    surviving centre word, every neighbour on the same line within ``WINDOW``
    positions on either side is emitted as a positive target, together with
    one negative drawn by ``get_negative``.

    Args:
        filenames: iterable of paths to UTF-8 encoded text files.

    Yields:
        Tuples ``(word, target, negati)`` of strings; ``word`` and ``target``
        keep their original casing.
    """
    MIN_COUNT = 2
    SUBSAMPLE_THRESHOLD = 0.00005
    for filename in filenames:
        with open(filename, 'r', encoding='utf8') as f:
            for line in f:
                words = line.split()
                max_j = len(words)
                for i, word in enumerate(words):
                    count = word_vocab[word.lower()]
                    if count <= MIN_COUNT:
                        continue
                    frequency = count / num_total_words
                    # Probability of discarding a frequent word; it is negative
                    # (never discard) for words rarer than the threshold.
                    number = 1 - math.sqrt(SUBSAMPLE_THRESHOLD / frequency)
                    if random.uniform(0, 1) <= number:
                        continue
                    # range() excludes its end point, so `+ 1` is required for
                    # the window to reach WINDOW words to the right as well as
                    # to the left. Clamping keeps j inside the line.
                    first = max(0, i - WINDOW)
                    last = min(max_j - 1, i + WINDOW)
                    for j in range(first, last + 1):
                        if j == i:
                            continue
                        target = words[j]
                        negati = get_negative(word)
                        yield (word, target, negati)

def prepare_word(word, char_to_index):
    """Encode ``word`` as a list of character indices wrapped in markers.

    Args:
        word: string to encode.
        char_to_index: mapping from character to integer index; must contain
            the '{' (start) and '}' (end) markers and every character of
            ``word``, otherwise ``KeyError`` is raised.

    Returns:
        ``[index('{'), index(c1), ..., index(cn), index('}')]``.
    """
    open_idx, close_idx = char_to_index['{'], char_to_index['}']
    body = list(map(char_to_index.__getitem__, word))
    return [open_idx, *body, close_idx]

In [11]:
BUFFER_SIZE = 10000
def get_buffer(filenames, buffer_size):
    """Yield lists of encoded training examples of roughly ``buffer_size`` items.

    Files are visited in a random order. Every triple produced by
    ``prepare_files`` contributes two examples: ``[word, target, 1]`` and
    ``[word, negative, 0]``, with each word encoded by ``prepare_word``.

    Args:
        filenames: sequence of file paths; it is not modified.
        buffer_size: number of examples after which a buffer is emitted.
            Examples are appended in pairs, so an emitted buffer may hold
            ``buffer_size + 1`` items when ``buffer_size`` is odd.

    Yields:
        Non-empty lists of ``[source_indices, target_indices, label]``.
    """
    # Shuffle a copy so the caller's list is left untouched.
    shuffled = list(filenames)
    random.shuffle(shuffled)
    buffer = []
    for word, target, negati in prepare_files(shuffled):
        word = prepare_word(word, char_to_index)
        target = prepare_word(target, char_to_index)
        negati = prepare_word(negati, char_to_index)
        buffer.append([word, target, 1])
        buffer.append([word, negati, 0])
        # `>=` rather than `==`: two items are added per step, so an odd
        # buffer_size would otherwise never match and the buffer would grow
        # without bound.
        if len(buffer) >= buffer_size:
            yield buffer
            buffer = []
    # Emit the remainder only when there is one; an empty buffer would turn
    # into an empty batch downstream.
    if buffer:
        yield buffer
    
def get_batch(filenames, buffer_size, batch_size):
    """Yield shuffled mini-batches of training examples.

    Each buffer obtained from ``get_buffer`` is shuffled in place and cut
    into consecutive slices of ``batch_size`` examples; the last slice of a
    buffer may be shorter. Empty buffers produce no batches.

    Args:
        filenames: file paths forwarded to ``get_buffer``.
        buffer_size: buffer size forwarded to ``get_buffer``.
        batch_size: maximum number of examples per batch; must be positive.

    Yields:
        Non-empty lists of examples.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    for buffer in get_buffer(filenames, buffer_size):
        random.shuffle(buffer)
        for start in range(0, len(buffer), batch_size):
            yield buffer[start:start + batch_size]
            
def pad_to_batch(batch):
    """Turn a batch of ``(source, target, label)`` examples into two tensors.

    Source and target index lists are right-padded with 0 to the longest
    sequence found in the batch (sources and targets considered together),
    then concatenated per example.

    Args:
        batch: non-empty sequence of ``(source, target, label)`` where
            ``source`` and ``target`` are lists of ints and ``label`` is an int.

    Returns:
        A pair ``(x, y)``: ``x`` is a LongTensor of shape
        ``[len(batch), 2 * max_length]`` and ``y`` a LongTensor of shape
        ``[len(batch)]`` holding the labels.
    """
    sources, targets, y = zip(*batch)
    max_length = max(len(seq) for seq in sources + targets)

    def _padded(seq):
        # Right-pad with the PAD index (0) up to the batch-wide maximum.
        return seq + [0] * (max_length - len(seq))

    rows = [_padded(src) + _padded(tgt) for src, tgt in zip(sources, targets)]
    inputs = Variable(LongTensor(rows))
    labels = Variable(LongTensor(list(y)))
    return inputs, labels

In [12]:
# Smoke test: build the list of corpus files and materialise one padded batch.
text_location = os.path.join(os.getcwd(), 'corpus/')
# os.listdir returns entries in arbitrary order; sorting makes the file order
# (and hence the seeded shuffling that follows) reproducible. Non-file entries
# such as sub-directories are skipped because they cannot be opened as text.
filenames = [os.path.join(text_location, filename)
             for filename in sorted(os.listdir(text_location))
             if os.path.isfile(os.path.join(text_location, filename))]
batches = get_batch(filenames, BUFFER_SIZE, 100)
hey=pad_to_batch(next(batches))

In [13]:
# Sanity check: the label tensor should hold one entry per example in the batch.
hey[1].size()

torch.Size([100])

# Model

In [24]:
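# Model configuration used below:
# - character embedding dimension: 15
# - convolution filter widths: [1, 2, 3, 4, 5, 6, 7]
# - number of filters per width: [50, 100, 150, 200, 200, 200, 200]
# - tanh activation after each convolution
# - 2 highway layers
# - ReLU activation inside the highway layers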

class Word2CNN(nn.Module):
    """Character-level CNN that classifies (word, context-word) pairs.

    Each word is embedded character by character, passed through parallel
    convolutions of several widths with max-over-time pooling, then through
    highway layers. The feature vectors of the two words of a pair are
    concatenated and mapped to two classes by a linear layer.

    Args:
        vocab_size: number of character indices (including the PAD index 0).
        embedding_dim: size of each character embedding.
        kernel_dims: number of filters for each convolution width.
        kernel_sizes: convolution widths, in characters; must have the same
            length as ``kernel_dims``.
        dropout: dropout probability used when ``is_training`` is true.
        highway_layers: number of highway layers.

    Raises:
        ValueError: if ``kernel_dims`` and ``kernel_sizes`` differ in length.
    """

    def __init__(self, vocab_size, embedding_dim, kernel_dims, kernel_sizes,
                 dropout=0.5, highway_layers=2):
        super(Word2CNN, self).__init__()
        # zip() would silently drop the surplus entries, leaving the highway
        # and output layers sized for filters that do not exist.
        if len(kernel_dims) != len(kernel_sizes):
            raise ValueError("kernel_dims and kernel_sizes must have the same length")
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, dim, (size, embedding_dim)) for dim, size in zip(kernel_dims, kernel_sizes)])
        self.dropout = nn.Dropout(dropout)
        internal_dim = sum(kernel_dims)
        self.hw_num_layers = highway_layers
        self.hw_nonlinear = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        self.hw_linear = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        self.hw_gate = nn.ModuleList([nn.Linear(internal_dim, internal_dim) for _ in range(highway_layers)])
        # Two word representations are concatenated per example.
        self.final_layer = nn.Linear(internal_dim * 2, 2)

    def forward(self, inputs, is_training=False):
        """Score a batch of word pairs.

        Args:
            inputs: LongTensor of shape ``[B, 2 * MAX_LENGTH]``; each row is
                a padded source word followed by a padded target word.
            is_training: when true, dropout is applied after pooling and
                after the highway layers.

        Returns:
            Tensor of shape ``[B, 2]`` with log-probabilities per class.
        """
        inputs = inputs.view(inputs.size()[0]*2, -1) # one word per row: [2B, MAX_LENGTH]
        # A convolution cannot be applied to a sequence shorter than its
        # kernel, so right-pad with the PAD index (0) up to the widest kernel.
        widest = max(conv.kernel_size[0] for conv in self.convs)
        shortfall = widest - inputs.size(1)
        if shortfall > 0:
            inputs = F.pad(inputs, (0, shortfall), value=0)
        inputs = self.embeddings(inputs).unsqueeze(1) # [2B, 1, MAX_LENGTH, EM_SIZE]
        inputs = [torch.tanh(conv(inputs)).squeeze(3) for conv in self.convs] # [2B, K_DIM, L']*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] # [2B, K_DIM]*len(Ks)
        inputs = torch.cat(inputs, 1) # [2B, sum(K_DIM)]
        if is_training:
            inputs = self.dropout(inputs)
        for layer in range(self.hw_num_layers):
            gate = torch.sigmoid(self.hw_gate[layer](inputs))
            nonlinear = F.relu(self.hw_nonlinear[layer](inputs))
            linear = self.hw_linear[layer](inputs)
            # Gate mixes the non-linear transform with a linear projection.
            inputs = gate * nonlinear + (1 - gate) * linear
        if is_training:
            inputs = self.dropout(inputs)
        # Re-join the two words of every pair: [2B, D] -> [B, 2D].
        inputs = inputs.view(-1, inputs.size()[1]*2)
        out = self.final_layer(inputs)
        return F.log_softmax(out, 1)

In [28]:
# Training hyper-parameters.
# Number of passes the training loop makes over the corpus.
EPOCH = 5
# Examples per optimisation step.
BATCH_SIZE = 128
# Size of each character embedding.
EMBEDDING_SIZE = 15
# Convolution widths and the number of filters used for each width
# (the two lists are paired element by element).
KERNEL_SIZES = [1, 2, 3, 4, 5, 6, 7]
KERNEL_DIMEN = [50, 100, 150, 200, 200, 200, 200]
# Learning rate passed to the optimizer.
LR = 0.001
# Character indices start at 1; the extra slot is index 0, the PAD symbol.
vocab_size = len(char_to_index) + 1

In [29]:
# Build the character CNN from the hyper-parameters configured above and
# move it to the GPU when one is available.
model = Word2CNN(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_SIZE,
    kernel_dims=KERNEL_DIMEN,
    kernel_sizes=KERNEL_SIZES,
)
model = model.cuda() if USE_CUDA else model

# NOTE(review): Word2CNN.forward already ends with log_softmax, and
# CrossEntropyLoss applies log-softmax to its input as well; the second
# application looks redundant -- consider nn.NLLLoss here.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

In [30]:
# Training loop: one optimisation step per mini-batch, reporting the mean loss
# accumulated since the previous report every 1000 batches.
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(get_batch(filenames, BUFFER_SIZE, BATCH_SIZE)):
        if not batch:
            # pad_to_batch cannot unpack an empty batch; skip it.
            continue
        inputs, targets = pad_to_batch(batch)
        model.zero_grad()
        # Second argument enables dropout inside the model.
        preds = model(inputs, True)
        loss = loss_function(preds, targets)
        # The loss is a 0-dim tensor, for which `.data.tolist()[0]` fails;
        # `.item()` returns the Python float directly.
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/5] mean_loss : 0.70
[0/5] mean_loss : 0.64
[0/5] mean_loss : 0.53
[0/5] mean_loss : 0.55
[0/5] mean_loss : 0.55
[0/5] mean_loss : 0.57
[0/5] mean_loss : 0.48
[0/5] mean_loss : 0.53
[1/5] mean_loss : 0.65
[1/5] mean_loss : 0.56
[1/5] mean_loss : 0.50
[1/5] mean_loss : 0.52
[1/5] mean_loss : 0.53
[1/5] mean_loss : 0.56
[1/5] mean_loss : 0.47
[1/5] mean_loss : 0.52
[2/5] mean_loss : 0.72
[2/5] mean_loss : 0.55
[2/5] mean_loss : 0.49
[2/5] mean_loss : 0.52
[2/5] mean_loss : 0.52
[2/5] mean_loss : 0.55
[2/5] mean_loss : 0.47
[2/5] mean_loss : 0.51


KeyboardInterrupt: 