In [1]:
import numpy as np

In [2]:
# TODO: implement k-nn search exploiting hubness reduction

# simple search is only available
def test_acc(prj_mat, src_emb, trg_emb, valid_data, k):
    raise

In [None]:
import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links as L


class Generator(chainer.Chain):
    def __init__(self, dim):
        self.dim = dim
        super(Generator, self).__init__(
            W = L.Linear(dim, dim)
        )

    def __call__(self, src_emb):
        """
        simple projection
        """
        src_emb = chainer.Variable(src_emb)
        return self.W(src_emb)


class Discriminator(chainer.Chain):
    def __init__(self, dim, hidden=2048, drop_rate=0.1):
        self.dim = dim
        self.hidden = hidden
        self.drop_rate = drop_rate
        super(Discriminator, self).__init__(
            W1 = L.Linear(dim, hidden),
            W2 = L.Linear(hidden, 2)
        )

    def __call__(self, xs):
        # TODO: adding noise?
        xs = F.dropout(xs, ratio=self.drop_rate)
        hs = F.leaky_relu(self.W1(xs))
        return self.W2(hs)

In [None]:
import chainer
import chainer.functions as F
import numpy as np


class GANTrainer(object):
    def __init__(self, models, opts, logger, valid_data, epoch, batchsize):
        self.gen, self.dis = models
        self.gen_opt, self.dis_opt = opts
        self.logger = logger
        self.epoch = epoch
        self.batchsize = batchsize
        self.valid_data = valid_data

    def fit(self, src_emb, trg_emb):
        n_src_word = len(src_emb)
        n_trg_word = len(trg_emb)
        n_sample = max(n_src_word, n_trg_word)
        for epoch in range(self.epoch):
            self.logger.info('start {} epoch'.format(epoch+1))
            sum_gen_loss = 0
            sum_dis_loss = 0
            for i in range(n_sample // self.batchsize):
                src_idxs = np.random.randint(0, n_src_word, size=self.batchsize)
                trg_idxs = np.random.randint(0, n_trg_word, size=self.batchsize)
                _src_emb = src_emb[src_idxs]
                _trg_emb = trg_emb[trg_idxs]

                prj_src_emb = self.gen(_src_emb)
                src_ys = self.dis(prj_src_emb)
                gen_loss = F.softmax_cross_entropy(src_ys, chainer.Variable(np.zeros(self.batchsize, dtype=np.int32)))
                dis_loss = F.softmax_cross_entropy(src_ys, chainer.Variable(np.ones(self.batchsize, dtype=np.int32)))

                trg_ys = self.dis(_trg_emb)
                dis_loss += F.softmax_cross_entropy(trg_ys, chainer.Variable(np.zeros(self.batchsize, dtype=np.int32)))

                # update for generator
                self.gen.zerograds()
                gen_loss.backward()
                self.gen_opt.update()

                # update for discriminator
                self.dis.zerograds()
                dis_loss.backward()
                self.dis_opt.update()

                sum_gen_loss += gen_loss.data
                sum_dis_loss += dis_loss.data

            self.logger.info('    generator loss: {}'.format(sum_gen_loss))
            self.logger.info('discriminator loss: {}'.format(sum_dis_loss))

In [None]:
import numpy as np


np.random.seed(46)

# generating synthetic data with random projection
def gen_synthetic_data(d, prj_d, n):
    z = np.random.normal(loc=0., scale=1.0, size=(n, d))
    r_x = np.random.uniform(-1, 1, size=(d, prj_d))
    r_y = np.random.uniform(-1, 1, size=(d, prj_d))
    x = z.dot(r_x) + np.random.randn(n, prj_d)  # true_x + gaussian noise
    y = z.dot(r_y) + np.random.randn(n, prj_d)
    return x.astype(np.float32), y.astype(np.float32)

In [3]:
import argparse
from chainer import optimizers
from datetime import datetime
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import logging
import os
from sklearn.model_selection import train_test_split

from net import Generator, Discriminator
from trainer import GANTrainer
from utils import gen_synthetic_data


np.random.seed(46)
DIM = 3000
DIM_EMB = 300
NUM = 10000
DEFAULT_LOG_DIR = os.path.join(os.path.abspath(os.path.dirname('deeplog')),
                               '{}'.format(datetime.now().strftime('%Y%m%d_%H:%M')))


def train(args):
    # setting for logging
    if not os.path.exists(args.log):
        os.mkdir(args.log)
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(args.log, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{:>10} -----> {}'.format(arg, val))

    x, y = gen_synthetic_data(DIM, DIM_EMB, NUM)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
    valid_x, test_x, valid_y, test_y = train_test_split(test_x, test_y, test_size=0.5)

    gen = Generator(DIM_EMB)
    dis = Discriminator(DIM_EMB)

    gen_opt = optimizers.Adam()
    dis_opt = optimizers.Adam()

    gen_opt.setup(gen)
    dis_opt.setup(dis)

    trainer = GANTrainer((gen, dis), (gen_opt, dis_opt), logger, (valid_x, valid_y), args.epoch)
    trainer.fit(train_x, train_y)




In [8]:
p = argparse.ArgumentParser("Adversarial object matching with synthetic data")
p.add_argument('--src', type=str, help='embedding file of source language')
p.add_argument('--trg', type=str, help='embedding file of target language')
p.add_argument('--epoch', type=int, default=100, help='number of epoch')
p.add_argument('--batchsize', type=int, default=32, help='minibatch size')
p.add_argument('--log', type=str, default=DEFAULT_LOG_DIR, help='log dir')

#train(p.parse_args())

_StoreAction(option_strings=['--log'], dest='log', nargs=None, const=None, default='/home/dz/code/Python/deeplearn/hw3/20171118_14:16', type=<class 'str'>, choices=None, help='log dir', metavar=None)

In [None]:
x, y = gen_synthetic_data(DIM, DIM_EMB, NUM)

In [None]:
x.shape, y.shape

In [7]:
p

ArgumentParser(prog='Adversarial object matching with synthetic data', usage=None, description=None, formatter_class=<class 'argparse.HelpFormatter'>, conflict_handler='error', add_help=True)

In [9]:
train(p.parse_args())

usage: Adversarial object matching with synthetic data [-h] [--epoch EPOCH]
                                                       [--batchsize BATCHSIZE]
                                                       [--log LOG]
                                                       data.csv trg.csv
Adversarial object matching with synthetic data: error: the following arguments are required: trg.csv


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [13]:
p.parse_args()

usage: Adversarial object matching with synthetic data [-h] [--epoch EPOCH]
                                                       [--batchsize BATCHSIZE]
                                                       [--log LOG]
                                                       data.csv trg.csv
Adversarial object matching with synthetic data: error: the following arguments are required: trg.csv


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
