## initialize folder and logger

In [1]:
import torch
import os
from datetime import datetime
import logging

assert torch.cuda.is_available(), "torch.cuda is not available"

## create experiment folder
log_dir = os.path.join(os.getcwd(),'log')
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
exp_dir = os.path.join(log_dir, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
if not os.path.exists(exp_dir):
    os.makedirs(exp_dir)

## set up logger
log_file = os.path.join(exp_dir, 'train.log')
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logger = logging.getLogger()
logger.handlers = []
logger.setLevel(logging.DEBUG)
logger.propagate = False
logger.addHandler(file_handler)
logger.addHandler(console_handler)
logger.info('Start time: %s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

Start time: 2018-05-18 16:07:04


## set hyper-parameters

In [2]:
src_emb_file = './data/wiki.en.vec'
tgt_emb_file = './data/wiki.es.vec'
src_lang = 'en'
tgt_lang = 'es'
emb_dim = 300
max_voc = 200000

dis_hid_dim = 2048
dis_dropout = 0.
dis_input_dropout = 0.1
lr = 0.1
decay = 0.95

n_epochs = 50
epoch_size = 200000
batch_size = 32
dis_steps = 5
dis_most_frequent = 75000
dis_smooth = 0.1
map_beta = 0.001

logger.info('===== Argument List =====')
logger.info('Source Language: %s' % src_lang)
logger.info('Target Language: %s' % tgt_lang)
logger.info('Embedding Dimension: %i' % emb_dim)
logger.info('Vocabulary Size (for both): %i' % max_voc)
logger.info('Discriminator Hidden Layer Dimension: %i' % dis_hid_dim)
logger.info('Discriminator Hidden Dropout: %.2f' % dis_dropout)
logger.info('Discriminator Input Dropout: %.2f' % dis_input_dropout)
logger.info('Learning Rate: %.2f' % lr)
logger.info('Decay: %.2f' % decay)
logger.info('Number of Epochs: %i' % n_epochs)
logger.info('Number of Iterations per Epoch: %i' % epoch_size)
logger.info('Batch Size: %i' % epoch_size)
logger.info('Number of Steps for Discriminator: %i' % epoch_size)
logger.info('Number of Most Frequent Words Fed into Discriminator: %i' % dis_most_frequent)
logger.info('Discriminator Smothiness: %.1f' % dis_smooth)
logger.info('Orthogonality Update Coefficient: %.3f' % map_beta)

===== Argument List =====
Source Language: en
Target Language: es
Embedding Dimension: 300
Vocabulary Size (for both): 200000
Discriminator Hidden Layer Dimension: 2048
Discriminator Hidden Dropout: 0.00
Discriminator Input Dropout: 0.10
Learning Rate: 0.10
Decay: 0.95
Number of Epochs: 50
Number of Iterations per Epoch: 200000
Batch Size: 200000
Number of Steps for Discriminator: 200000
Number of Most Frequent Words Fed into Discriminator: 75000
Discriminator Smothiness: 0.1
Orthogonality Update Coefficient: 0.001


## load word embeddings

In [3]:
import numpy as np
import io
from dict import Dictionary

def load_embedding(file, tag):
    word2id = {}
    vectors = []
    with io.open(file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                split = line.split()
                assert len(split) == 2
                assert emb_dim == int(split[1])
            else:
                if len(word2id) >= max_voc:
                    break
                word, vect = line.rstrip().split(' ', 1)
                word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:
                    vect[0] = 0.01
                if word in word2id:
                    logger.warning("word %s appears twice in the %s embedding" % (word, 'source'))
                    continue
                else:
                    if not vect.shape == (emb_dim,):
                        logger.warning("invalid dimension (%i,) for %s word %s" % (vect.shape[0], tag, word))
                        continue
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
    assert len(word2id) == len(vectors)
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, tag)
    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()
    embeddings = embeddings.cuda()
    logger.info("loaded %i pre-trained %s word embeddings" % (len(vectors), tag))
    
    return dico, embeddings

## load source embedding
src_dico, _src_emb = load_embedding(src_emb_file, src_lang)
src_emb = torch.nn.Embedding(len(src_dico), emb_dim, sparse=True)
src_emb.weight.data.copy_(_src_emb)

## load target embedding
tgt_dico, _tgt_emb = load_embedding(tgt_emb_file, tgt_lang)
tgt_emb = torch.nn.Embedding(len(tgt_dico), emb_dim, sparse=True)
tgt_emb.weight.data.copy_(_tgt_emb)

loaded 200000 pre-trained en word embeddings
loaded 200000 pre-trained es word embeddings


tensor([[-1.3075e-01, -8.7659e-02, -1.1427e-01,  ..., -4.0476e-02,
         -1.2293e-02,  4.2569e-02],
        [-3.6446e-01,  9.5962e-02, -1.6188e-01,  ..., -1.4986e-01,
          2.3584e-01,  1.8541e-01],
        [-5.9110e-02, -8.3343e-02, -9.3019e-02,  ..., -5.4064e-02,
          1.7285e-01,  1.6713e-01],
        ...,
        [ 3.2125e-01,  1.3622e-01, -5.0101e-01,  ...,  1.4182e-01,
          5.0989e-01,  2.2007e-01],
        [-4.6783e-01, -7.4949e-01, -4.4708e-02,  ...,  9.5594e-01,
         -3.6959e-01,  1.0554e-01],
        [-7.4782e-02, -3.6216e-01, -1.8766e-01,  ..., -2.3346e-01,
          6.2097e-02, -2.3693e-01]])

## model architecture

In [4]:
# Generator
mapping = torch.nn.Linear(emb_dim, emb_dim, bias=False)
mapping.weight.data.copy_(torch.diag(torch.ones(emb_dim)))

# Discriminator
class Discriminator(torch.nn.Module):
    
    def __init__(self):
        super(Discriminator, self).__init__()
        self.layers = [torch.nn.Dropout(dis_input_dropout)]
        self.layers.append(torch.nn.Linear(emb_dim, dis_hid_dim))
        self.layers.append(torch.nn.LeakyReLU(0.2))
        self.layers.append(torch.nn.Dropout(dis_dropout))
        self.layers.append(torch.nn.Linear(dis_hid_dim, dis_hid_dim))
        self.layers.append(torch.nn.LeakyReLU(0.2))
        self.layers.append(torch.nn.Dropout(dis_dropout))
        self.layers.append(torch.nn.Linear(dis_hid_dim, 1))
        self.layers.append(torch.nn.Sigmoid())
        self.layers = torch.nn.Sequential(*self.layers)

    def forward(self, x):
            assert x.dim() == 2 and x.size(1) == emb_dim
            return self.layers(x).view(-1)

discriminator = Discriminator()
        
# Cuda
src_emb.cuda()
tgt_emb.cuda()
mapping.cuda()
discriminator.cuda()

Discriminator(
  (layers): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=300, out_features=2048, bias=True)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Dropout(p=0.0)
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): LeakyReLU(negative_slope=0.2)
    (6): Dropout(p=0.0)
    (7): Linear(in_features=2048, out_features=1, bias=True)
    (8): Sigmoid()
  )
)

In [5]:
from torch import optim
from torch.optim.lr_scheduler import LambdaLR
optimizer_g = optim.SGD(mapping.parameters(), lr)
optimizer_d = optim.SGD(discriminator.parameters(), lr)
lambda_ = lambda epoch: decay ** epoch
scheduler_g = LambdaLR(optimizer_g, lr_lambda=[lambda_])
scheduler_d = LambdaLR(optimizer_d, lr_lambda=[lambda_])

## train

In [None]:
from torch.autograd import Variable
from torch.nn import functional
from evaluator import get_crosslingual_wordsim_scores
from evaluator import get_word_translation_accuracy
from evaluator import get_unsupervised_evaluation

assert dis_most_frequent <= min(len(src_dico), len(tgt_dico))

def get_xy(src_emb, tgt_emb, vol):
    src_ids = torch.LongTensor(batch_size).random_(dis_most_frequent).cuda()
    tgt_ids = torch.LongTensor(batch_size).random_(dis_most_frequent).cuda()
    src_emb_ = src_emb(Variable(src_ids))
    tgt_emb_ = tgt_emb(Variable(tgt_ids))
    src_emb_ = mapping(Variable(src_emb_.data))
    tgt_emb_ = Variable(tgt_emb_.data)
    x = torch.cat([src_emb_, tgt_emb_], 0)
    y = torch.FloatTensor(2 * batch_size).zero_()
    y[:batch_size] = 1 - dis_smooth
    y[batch_size:] = dis_smooth
    y = Variable(y.cuda())
    return x, y

eval_list = []
logger.info('===== ADVERSARIAL TRAINING =====')
for epoch in range(n_epochs):
    logger.info('\nstart epoch %i' % epoch)
    
    for i_iter in range(0, epoch_size, batch_size):
        
        ## discriminiator
        for i_dis in range(dis_steps):
            discriminator.train()
            x, y = get_xy(src_emb, tgt_emb, True)
            preds = discriminator(Variable(x.data))
            loss = functional.binary_cross_entropy(preds, y)
            optimizer_d.zero_grad()
            loss.backward()
            optimizer_d.step()
            
        if i_iter % 3000 == 0:
            logger.info('iteration %s, loss %.4f' % (i_iter, loss.data.item()))
            
        ## generator
        discriminator.eval()
        x, y = get_xy(src_emb, tgt_emb, False)
        preds = discriminator(x)
        loss = functional.binary_cross_entropy(preds, 1 - y)
        optimizer_g.zero_grad()
        loss.backward()
        optimizer_g.step()
        W = mapping.weight.data
        W.copy_((1 + map_beta) * W - map_beta * W.mm(W.transpose(0, 1).mm(W)))
        
        if i_iter == 0:
            optimizer_d.zero_grad()
            scheduler_d.step()
            optimizer_g.zero_grad()
            scheduler_g.step()
        
    # unsupervised evaluation metric
    unsupervised_score = get_unsupervised_evaluation(src_dico.word2id, mapping(src_emb.weight).data, tgt_dico.word2id, tgt_emb.weight.data, 10)
    if len(eval_list)>0 and unsupervised_score < min(eval_list):
        for g in optimizer_g.param_groups:
            g['lr'] = g['lr']/2
        for g in optimizer_d.param_groups:
            g['lr'] = g['lr']/2
    eval_list.append(unsupervised_score)
    logger.info("csls unsupervised metric score: %.5f" % unsupervised_score)
    
    # cross-lingual similarity evaluation
    src_tgt_ws_score = get_crosslingual_wordsim_scores(src_lang, src_dico.word2id, mapping(src_emb.weight).data.cpu().numpy(), 
                                                        tgt_lang, tgt_dico.word2id, tgt_emb.weight.data.cpu().numpy())
    logger.info("cross-lingual word similarity score average: %.5f" % src_tgt_ws_score)
    
    # word translation evaluation
    word_translation_nn_result = get_word_translation_accuracy(src_lang, src_dico.word2id, mapping(src_emb.weight).data, 
                    tgt_lang, tgt_dico.word2id, tgt_emb.weight.data, 'nn')
    word_translation_csls_result = get_word_translation_accuracy(src_lang, src_dico.word2id, mapping(src_emb.weight).data, 
                    tgt_lang, tgt_dico.word2id, tgt_emb.weight.data, 'csls_knn_10')

===== ADVERSARIAL TRAINING =====

start epoch 0
iteration 0, loss 0.6626
iteration 12000, loss 0.4874
iteration 24000, loss 0.5148
iteration 36000, loss 0.4164
iteration 48000, loss 0.4684
iteration 60000, loss 0.4093
iteration 72000, loss 0.4104
iteration 84000, loss 0.4136
iteration 96000, loss 0.4455
iteration 108000, loss 0.4805
iteration 120000, loss 0.3914
iteration 132000, loss 0.4168
iteration 144000, loss 0.4113
iteration 156000, loss 0.3925
iteration 168000, loss 0.3837
iteration 180000, loss 0.4212
iteration 192000, loss 0.4034
csls unsupervised metric score: -0.01453
cross-lingual word similarity score average: 0.24641
found 2975 pairs of words in the dictionary. 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)
2975 source words - nn - Precision at k = 1: 0.369748
1500 unique source words - nn - Precision at k = 1: 0.733333
2975 source words - nn - Precision at k = 5: 1.075630
1500 unique source words - nn - Precision at k = 5: 2.000000
2975 source

iteration 60000, loss 0.3861
iteration 72000, loss 0.4044
iteration 84000, loss 0.3845
iteration 96000, loss 0.4049
iteration 108000, loss 0.4086
iteration 120000, loss 0.3743
iteration 132000, loss 0.3807
iteration 144000, loss 0.3832
iteration 156000, loss 0.3661
iteration 168000, loss 0.3708
iteration 180000, loss 0.3933
iteration 192000, loss 0.4034
csls unsupervised metric score: 0.11282
cross-lingual word similarity score average: 0.68847
found 2975 pairs of words in the dictionary. 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)
2975 source words - nn - Precision at k = 1: 27.697479
1500 unique source words - nn - Precision at k = 1: 54.933333
2975 source words - nn - Precision at k = 5: 49.008403
1500 unique source words - nn - Precision at k = 5: 68.400000
2975 source words - nn - Precision at k = 10: 56.605042
1500 unique source words - nn - Precision at k = 10: 71.866667
found 2975 pairs of words in the dictionary. 0 other pairs contained at least 

iteration 132000, loss 0.3608
iteration 144000, loss 0.3708
iteration 156000, loss 0.3645
iteration 168000, loss 0.3639
iteration 180000, loss 0.3627
iteration 192000, loss 0.3662
csls unsupervised metric score: 0.08568
cross-lingual word similarity score average: 0.64951
found 2975 pairs of words in the dictionary. 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)
2975 source words - nn - Precision at k = 1: 25.445378
1500 unique source words - nn - Precision at k = 1: 50.466667
2975 source words - nn - Precision at k = 5: 45.512605
1500 unique source words - nn - Precision at k = 5: 66.133333
2975 source words - nn - Precision at k = 10: 53.647059
1500 unique source words - nn - Precision at k = 10: 71.600000
found 2975 pairs of words in the dictionary. 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)
2975 source words - csls_knn_10 - Precision at k = 1: 29.680672
1500 unique source words - csls_knn_10 - Precision at k = 1: 58.866667