## initialize folder and logger

In [2]:
import torch
import os
from datetime import datetime
import logging

assert torch.cuda.is_available(), "torch.cuda is not available"

## create experiment folder
log_dir = os.path.join(os.getcwd(),'log')
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
exp_dir = os.path.join(log_dir, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
if not os.path.exists(exp_dir):
    os.makedirs(exp_dir)

## set up logger
log_file = os.path.join(exp_dir, 'train.log')
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
logger = logging.getLogger()
logger.handlers = []
logger.setLevel(logging.DEBUG)
logger.propagate = False
logger.addHandler(file_handler)
logger.addHandler(console_handler)
logger.info('Start time: %s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

Start time: 2018-05-12 14:09:12


## load word embeddings

In [28]:
import numpy as np
import io
from dict import Dictionary

src_emb_file = './data/wiki.en.vec'
tgt_emb_file = './data/wiki.es.vec'
emb_dim = 300
max_voc = 200000

def load_embedding(file, tag):
    word2id = {}
    vectors = []
    with io.open(file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                split = line.split()
                assert len(split) == 2
                assert emb_dim == int(split[1])
            else:
                if len(word2id) >= max_voc:
                    break
                word, vect = line.rstrip().split(' ', 1)
                word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:
                    vect[0] = 0.01
                if word in word2id:
                    logger.warning("word %s appears twice in the %s embedding" % (word, 'source'))
                    continue
                else:
                    if not vect.shape == (emb_dim,):
                        logger.warning("invalid dimension (%i,) for %s word %s" % (vect.shape[0], tag, word))
                        continue
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
    assert len(word2id) == len(vectors)
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, tag)
    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()
    embeddings = embeddings.cuda()
    logger.info("loaded %i pre-trained %s word embeddings" % (len(vectors), tag))
    
    return dico, embeddings

## load source embedding
src_dico, _src_emb = load_embedding(src_emb_file, 'en')
src_emb = torch.nn.Embedding(len(src_dico), emb_dim, sparse=True)
src_emb.weight.data.copy_(_src_emb)

## load target embedding
tgt_dico, _tgt_emb = load_embedding(tgt_emb_file, 'es')
tgt_emb = torch.nn.Embedding(len(tgt_dico), emb_dim, sparse=True)
tgt_emb.weight.data.copy_(_tgt_emb)

loaded 200000 pre-trained en word embeddings
loaded 200000 pre-trained es word embeddings


tensor([[-1.3075e-01, -8.7659e-02, -1.1427e-01,  ..., -4.0476e-02,
         -1.2293e-02,  4.2569e-02],
        [-3.6446e-01,  9.5962e-02, -1.6188e-01,  ..., -1.4986e-01,
          2.3584e-01,  1.8541e-01],
        [-5.9110e-02, -8.3343e-02, -9.3019e-02,  ..., -5.4064e-02,
          1.7285e-01,  1.6713e-01],
        ...,
        [ 3.2125e-01,  1.3622e-01, -5.0101e-01,  ...,  1.4182e-01,
          5.0989e-01,  2.2007e-01],
        [-4.6783e-01, -7.4949e-01, -4.4708e-02,  ...,  9.5594e-01,
         -3.6959e-01,  1.0554e-01],
        [-7.4782e-02, -3.6216e-01, -1.8766e-01,  ..., -2.3346e-01,
          6.2097e-02, -2.3693e-01]])