# 第5回 宿題

## 課題: 文章の属性を制御したまま文章を生成するモデルの実装

- 課題2で実装したモデルを用いて, どれだけ指定した属性に沿って文章が生成できるかを競います.
- 演習で用いた映画レビューのデータセットを用い, 文章の属性を制御しながら生成をおこなえるモデルを構築してください.
- 学習させたモデルで肯定的なレビュー・否定的なレビューをそれぞれ500件 (合計1000件) 生成し, 行区切りでファイルに保存したものを提出してください (`sample_submission.txt`を参考にしてください).
    - 前半500行は肯定的なレビュー, 後半500行は否定的なレビューとしてください.
- 生成文の属性の評価には, 学習データで事前に訓練させたCNNテキスト識別モデルによる予測結果の精度 (F1スコア) を用います.
    - CNNで使用する単語ID辞書は事前に`vocab.dump`として用意してあります (下コード参照). 必要であれば使用してください.

In [None]:
import os
try:
    from utils import Vocab
except ModuleNotFoundError: # iLect環境
    os.chdir('/root/userspace/chap5')
    from utils import Vocab

In [None]:
! head -10 sample_submission.txt

In [None]:
import pickle

VOCAB_PATH = './vocab.dump'

# Training data
TRAIN_X_PATH = './data/styletransfer/train_x.txt'
TRAIN_Y_PATH = './data/styletransfer/train_y.txt'

# Validation data
VALID_X_PATH = './data/styletransfer/valid_x.txt'
VALID_Y_PATH = './data/styletransfer/valid_y.txt'

# Word-ID dictionary prepared for the evaluation CNN; it has the same format
# as the Vocab used in the exercise.
# NOTE: unpickling can execute arbitrary code, so only load the vocab.dump
# that ships with this assignment.
# The context manager closes the file handle as soon as loading is done.
with open(VOCAB_PATH, 'rb') as f:
    vocab = pickle.load(f)

##### 以下のサンプルコードを参考にしてください.

データセットの準備

In [None]:
import re
import math
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence

try:
    from utils import Vocab
except ModuleNotFoundError: # iLect環境
    os.chdir('/root/userspace/chap5')
    from utils import Vocab

# Fix both RNG seeds so that runs are repeatable.
torch.manual_seed(34)
np.random.seed(34)

# Training schedule
num_epochs, batch_size = 2, 32

# Word-embedding size and LSTM hidden-state size
embedding_size = hidden_size = 300
# Sizes of the latent variable z and of the latent code c
latent_z_size, latent_c_size = 32, 2
latent_size = latent_c_size + latent_z_size
# Number of filters of the Discriminator (CNN)
n_filters = 100

# Words occurring fewer than min_count times are replaced by <UNK>
max_length, min_count = 11, 1

word_drop_rate = 0.5

# IDs and surface forms of the special tokens
PAD, BOS, EOS, UNK = range(4)
PAD_TOKEN, UNK_TOKEN = '<PAD>', '<UNK>'
BOS_TOKEN, EOS_TOKEN = '<S>', '</S>'

# Loss weights
beta = lmd_c = lmd_u = lmd_z = 0.1

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def load_data(path, n_data=10e+10):
    """Read a whitespace-tokenised text file into a list of token lists.

    Each line of the file becomes one list of words (split on whitespace);
    an empty line yields an empty list.

    Args:
        path: Path of a UTF-8 encoded text file, one sentence per line.
        n_data: Maximum number of lines to read. The default is large enough
            to read any realistic file completely. A value of 0 (or less)
            yields an empty list.

    Returns:
        A list with one list of word strings per line that was read.
    """
    data = []
    # The context manager guarantees the file handle is closed, even if
    # reading fails part-way through.
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Check the limit before appending so that n_data=0 reads nothing.
            if i >= n_data:
                break
            data.append(line.strip().split())
    return data

class DataLoader:
    """Batch provider for sentences and their labels (to be implemented).

    NOTE(review): later in this file it is constructed as
    ``DataLoader(X, Y, batch_size)`` with a list of word-ID sentences, the
    array loaded from the matching label file, and the batch size. How it
    iterates, pads or shuffles is not specified by this template.
    """
    # WRITE ME

# Seed the vocabulary with the four special tokens and their fixed IDs.
vocab = Vocab(
    dict(zip(
        (PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN),
        (PAD, BOS, EOS, UNK),
    )),
    UNK_TOKEN,
)

# Tokenised sentences (lists of words) of the training and validation files.
sens_train_X, sens_valid_X = load_data(TRAIN_X_PATH), load_data(VALID_X_PATH)

# Only the training sentences are used to build the vocabulary.
vocab.build_vocab(sens_train_X, min_count)

# Convert every sentence into its sequence of word IDs.
train_X = list(map(vocab.sentence_to_ids, sens_train_X))
valid_X = list(map(vocab.sentence_to_ids, sens_valid_X))

# Numeric arrays read from the Y files (presumably one label per sentence).
train_Y, valid_Y = np.loadtxt(TRAIN_Y_PATH), np.loadtxt(VALID_Y_PATH)

vocab_size = len(vocab.word2id)
print('語彙数:', vocab_size)
print('学習用データ数:', len(train_X))
print('検証用データ数:', len(valid_X))

# Split the training pairs 90% / 10% into a "stage 1" and a "stage 2" subset.
(train_X_stage1, train_X_stage2,
 train_Y_stage1, train_Y_stage2) = train_test_split(train_X, train_Y, test_size=0.1)

dataloader_train_stage1, dataloader_train_stage2 = (
    DataLoader(train_X_stage1, train_Y_stage1, batch_size),
    DataLoader(train_X_stage2, train_Y_stage2, batch_size),
)

dataloader_valid = DataLoader(valid_X, valid_Y, batch_size)

モデルの定義

In [None]:
class Encoder(nn.Module):
    """Encoder network (to be implemented).

    NOTE(review): built later as ``Encoder(**E_args)``, so the constructor
    parameters must match the keys placed in ``E_args``. Presumably maps a
    sentence to the latent variable z — confirm against the assignment.
    """
    # WRITE ME
    
class Generator(nn.Module):
    """Generator network (to be implemented).

    NOTE(review): built later as ``Generator(**G_args)``, so the constructor
    parameters must match the keys placed in ``G_args``. Presumably decodes
    a latent vector (z and code c) into a sentence — confirm.
    """
    # WRITE ME

class Discriminator(nn.Module):
    """Discriminator network (to be implemented).

    The hyperparameter section describes it as a CNN whose filter count is
    ``n_filters``. NOTE(review): built later as ``Discriminator(**D_args)``,
    so the constructor parameters must match the keys placed in ``D_args``.
    """
    # WRITE ME

In [None]:
# Keyword arguments for the Encoder constructor (to be filled in).
E_args = {
    # WRITE ME
}

# Keyword arguments for the Generator constructor (to be filled in).
G_args = {
    # WRITE ME
}

# Keyword arguments for the Discriminator constructor (to be filled in).
D_args = {
    # WRITE ME
}

# Instantiate the three networks and move them to the selected device.
E = Encoder(**E_args).to(device)
G = Generator(**G_args).to(device)
D = Discriminator(**D_args).to(device)

# One Adam optimizer per network, each with Adam's default settings.
optimizer_E = optim.Adam(E.parameters())
optimizer_G = optim.Adam(G.parameters())
optimizer_D = optim.Adam(D.parameters())

損失関数

In [None]:
def sample_z_prior(batch_size):
    """Sample latent variables z from the prior (to be implemented).

    NOTE(review): presumably returns one z of size ``latent_z_size`` per
    batch item — confirm shape and device when implementing.
    """
    # WRITE ME

def sample_c_prior(batch_size):
    """Sample latent codes c from the prior (to be implemented).

    NOTE(review): presumably returns one code of size ``latent_c_size`` per
    batch item — confirm shape and device when implementing.
    """
    # WRITE ME

def compute_loss_vae(x, x_lens, lmd, use_c_prior=True, is_train=False):
    """Compute the VAE loss for a batch (to be implemented).

    NOTE(review): the meaning of the parameters is not defined in this
    template. Presumably ``x`` / ``x_lens`` are a padded batch and its
    lengths, ``lmd`` a loss weight, ``use_c_prior`` selects whether c is
    drawn from the prior, and ``is_train`` toggles the parameter update —
    confirm against the exercise material.
    """
    # WRITE ME

def compute_loss_s(x, c):
    """Compute the loss term "s" from ``x`` and ``c`` (to be implemented).

    NOTE(review): the exact definition is not given in this template;
    presumably ``x`` is a batch of sentences and ``c`` their attribute
    codes — confirm.
    """
    # WRITE ME

def compute_loss_u():
    """Compute the loss term "u" (to be implemented).

    NOTE(review): takes no arguments, so it must rely on module-level state
    or on freshly sampled data; presumably weighted by ``lmd_u`` — confirm.
    """
    # WRITE ME

def compute_loss_attr():
    """Compute the attribute loss (to be implemented).

    NOTE(review): takes no arguments, so it must rely on module-level state
    or on freshly sampled data; the exact definition is not given in this
    template.
    """
    # WRITE ME

def get_kl_weight(step):
    """Return the KL-term weight for training step ``step`` (to be implemented).

    NOTE(review): presumably an annealing schedule over the global step
    counter — confirm the intended schedule.
    """
    # WRITE ME

VAEの学習

In [None]:
# Global step counter, initialised before the VAE training loop.
step = 0
# Wall-clock start time of this training phase.
start_time = time.time()
# VAE training loop: one iteration per epoch (body to be implemented).
for epoch in range(num_epochs):
    # WRITE ME

def sample(batch_size, max_length, c=None):
    """Generate sentences, optionally conditioned on a code ``c`` (to be implemented).

    The generation cell calls ``sample(1, max_length, c_n.unsqueeze(0))[0]``
    and then applies ``.cpu().numpy()[0]`` to the result, so the first
    element of the return value must be a tensor of word IDs with a leading
    batch dimension.
    NOTE(review): behaviour for ``c=None`` is not specified here; presumably
    c is then drawn from the prior — confirm.
    """
    # WRITE ME

各componentの学習

In [None]:
def compute_loss_D(x, c, is_train=False):
    """Compute the Discriminator loss and return it (body to be implemented).

    NOTE(review): the implementation must assign ``loss_D`` before the
    return statement. Presumably ``x`` is a batch of sentences, ``c`` their
    attribute codes, and ``is_train`` toggles the parameter update — confirm.
    """
    # WRITE ME
    return loss_D

def compute_loss_G(x, x_lens, is_train=False):
    """Compute the Generator loss for a batch (to be implemented).

    NOTE(review): presumably ``x`` / ``x_lens`` are a padded batch and its
    lengths and ``is_train`` toggles the parameter update — confirm.
    """
    # WRITE ME

def compute_loss_E(x, x_lens, is_train=False):
    """Compute the Encoder loss for a batch (to be implemented).

    NOTE(review): presumably ``x`` / ``x_lens`` are a padded batch and its
    lengths and ``is_train`` toggles the parameter update — confirm.
    """
    # WRITE ME

# Wall-clock start time of the component training phase.
start_time = time.time()
# Component training loop: one iteration per epoch (body to be implemented).
for epoch in range(num_epochs):
    # WRITE ME

文生成

In [None]:
# Put the networks into evaluation mode before generating.
E.eval()
G.eval()

generated_sentences = []
# One-hot attribute codes, 500 per class. Indexing eye(2) with label 1 gives
# [0, 1] and with label 0 gives [1, 0].
# NOTE(review): assumes label 1 = positive and label 0 = negative in the
# training labels — confirm against train_y.txt.
c_pos = torch.eye(2)[torch.ones(500, dtype=torch.long)]
# The negative codes must use label 0; using ones here would make all 1000
# sentences share the same attribute code.
c_neg = torch.eye(2)[torch.zeros(500, dtype=torch.long)]

# Positive codes first, negative codes second, as the submission format requires.
c = torch.cat([c_pos, c_neg], dim=0)

# No gradients are needed for generation; skipping them saves memory and time.
with torch.no_grad():
    for c_n in c:
        # Generate one sentence for this code and take its word-ID sequence.
        x_hat = sample(1, max_length, c_n.unsqueeze(0))[0]
        x_hat = x_hat.cpu().numpy()[0]
        x_hat = ' '.join(vocab.id2word[i] for i in x_hat)

        generated_sentences.append(x_hat + '\n')

# Write one sentence per line; explicit UTF-8 matches how the data files are read.
with open('tmp.txt', 'w', encoding='utf-8') as f:
    f.writelines(generated_sentences)