# 第3回 宿題

## 課題：ニューラル翻訳モデル

- 訓練データ（train_X, train_Y）でモデルを訓練した後、テストデータ（test_X）に対する翻訳文を生成し、結果をcsvファイルに出力してください。
- csvファイルは`sample_submission.csv`と同様に、一行に一文ずつ、単語をスペースで区切って書き込んでください。
- ファイル名は`submission.csv`としてください。
- 予測結果のtest_Yに対する精度（BLEU）で評価します。

In [None]:
! head sample_submission.csv

In [None]:
# サンプルコード

import random
import csv
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from nltk import bleu_score
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
try:
    from utils import Vocab
except ModuleNotFoundError:  # iLect環境
    import os
    os.chdir('/root/userspace/chap3/')
    from utils import Vocab

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.manual_seed(1)  # seeds PyTorch's own generator
random_state = 42

# Ids reserved for the four special vocabulary entries.
PAD, UNK, BOS, EOS = 0, 1, 2, 3
# Surface forms of the special entries, listed in the same order as the ids.
PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN = '<PAD>', '<UNK>', '<S>', '</S>'


def load_data(file_path):
    """
    Read a whitespace-tokenized text file into a list of token lists.

    :param file_path: path to a UTF-8 encoded text file, one sentence per line
    :return data: list of list of str, one inner list per line
        (a blank line yields an empty list)
    """
    # The context manager closes the handle as soon as reading is done
    # instead of leaving that to the garbage collector.
    with open(file_path, encoding='utf-8') as f:
        # strip() drops the trailing newline; split() tokenizes on whitespace.
        return [line.strip().split() for line in f]


def load_dataset():
    """
    Load the tokenized corpus files from ./data.

    :return: tuple (train_X, train_Y, test_X), read from train.en, train.ja
        and test.en respectively
    """
    corpus_files = ('./data/train.en', './data/train.ja', './data/test.en')
    train_X, train_Y, test_X = (load_data(path) for path in corpus_files)
    return train_X, train_Y, test_X


def sentence_to_ids(vocab, sentence):
    """
    Convert a list of words into a list of vocabulary ids.

    Words that are missing from ``vocab.word2id`` are mapped to UNK, and the
    result is wrapped with BOS at the front and EOS at the end.

    :param vocab: Vocab instance (only its ``word2id`` mapping is read)
    :param sentence: list of str
    :return indices: list of int
    """
    ids = [BOS]
    for word in sentence:
        ids.append(vocab.word2id.get(word, UNK))
    ids.append(EOS)
    return ids


def pad_seq(seq, max_length):
    """
    Right-pad a sequence with PAD up to max_length.

    :param seq: list of ids
    :param max_length: target length; a seq that is already this long or
        longer comes back with no padding added
    :return res: new list, seq followed by the PAD entries
    """
    missing = max_length - len(seq)
    padding = [PAD] * missing  # a non-positive count yields an empty list
    return seq + padding


class DataLoader(object):
    # Batch iterator over the id-encoded (X, Y) sentence pairs; left for the student to implement.
    # Interface the rest of this file relies on:
    #   - constructed as DataLoader(X, Y, batch_size) or DataLoader(X, Y, batch_size, shuffle=False),
    #   - iterating it yields (batch_X, batch_Y, lengths_X) triples,
    #   - it exposes a `data` attribute whose len() is used to normalize the summed loss.
    # NOTE(review): compute_loss reads batch_Y.size(0) as the decoding length and transposes
    # batch_Y before converting it to lists, so batches look time-major (max_len, batch) — confirm.
    # WRITE ME!


class Encoder(nn.Module):
    # Source-side network of the translation model; left for the student to implement.
    # Nothing else in this file calls it directly — presumably EncoderDecoder owns it (TODO confirm).
    # WRITE ME!


class Decoder(nn.Module):
    # Target-side network of the translation model; left for the student to implement.
    # Nothing else in this file calls it directly — presumably EncoderDecoder owns it (TODO confirm).
    # WRITE ME!
    

class EncoderDecoder(nn.Module):
    # Full translation model; left for the student to implement.
    # Interface the rest of this file relies on:
    #   - constructed with the keyword arguments input_size, output_size and hidden_size,
    #   - training/validation call: model(batch_X, lengths_X, max_length, batch_Y, use_teacher_forcing),
    #   - generation call: model(batch_X, lengths_X, max_length=test_max_length),
    #   - the returned tensor is passed to the cross-entropy loss together with batch_Y and is
    #     reduced with max(dim=-1) to obtain predicted ids.
    # WRITE ME!


# Summed (not averaged) cross entropy; positions whose target is PAD are ignored.
# `reduction='sum'` is the current spelling of the deprecated `size_average=False`.
mce = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD)


def masked_cross_entropy(logits, target):
    """
    Return the summed cross entropy between logits and target, skipping PAD targets.

    Both tensors are flattened with ``view``, so they must be contiguous.

    :param logits: tensor of scores whose last dimension indexes the classes
    :param target: tensor of class ids with one entry per row of the flattened logits
    :return: scalar tensor holding the summed loss
    """
    return mce(logits.view(-1, logits.size(-1)), target.view(-1))


def compute_loss(batch_X, batch_Y, lengths_X, model, optimizer=None, is_train=True):
    """
    Run one forward pass on a batch and, in training mode, one optimizer step.

    :param batch_X: source batch, passed to the model unchanged
    :param batch_Y: target id tensor; its first dimension is used as the decoding length
    :param lengths_X: source lengths, passed to the model unchanged
    :param model: callable as model(batch_X, lengths_X, max_length, batch_Y, use_teacher_forcing)
    :param optimizer: optimizer to step; required when is_train is True
    :param is_train: True to update the parameters, False to only evaluate
    :return: tuple (loss as a Python float, gold id lists, predicted id lists);
        both lists are transposed relative to the tensors they come from
    :raises ValueError: if is_train is True but no optimizer was given
    """
    if is_train and optimizer is None:
        # Fail before any work is done rather than with an AttributeError on None later.
        raise ValueError('optimizer is required when is_train=True')

    model.train(is_train)  # switch between train and eval mode

    # Teacher forcing is only considered while training, and then only with
    # probability teacher_forcing_rate.
    use_teacher_forcing = is_train and (random.random() < teacher_forcing_rate)
    max_length = batch_Y.size(0)

    # Autograd bookkeeping is only needed when backward() follows; switching it
    # off for evaluation avoids building a graph that is never used.
    with torch.set_grad_enabled(bool(is_train)):
        # Forward pass
        pred_Y = model(batch_X, lengths_X, max_length, batch_Y, use_teacher_forcing)
        # Loss
        loss = masked_cross_entropy(pred_Y.contiguous(), batch_Y.contiguous())

    if is_train:  # update the parameters only while training
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    batch_Y = batch_Y.transpose(0, 1).contiguous().data.cpu().tolist()
    # Index of the highest-scoring class at every position.
    pred = pred_Y.max(dim=-1)[1].data.cpu().numpy().T.tolist()

    return loss.item(), batch_Y, pred


def _cut_at_eos(ids):
    """Return the part of ids before the first EOS, or ids itself when EOS is absent."""
    try:
        return ids[:ids.index(EOS)]
    except ValueError:
        return ids


def calc_bleu(refs, hyps):
    """
    Compute corpus-level BLEU, scaled by 100, over id sequences.

    Every sequence is truncated at its first EOS. A sequence without EOS is
    used as-is; this now applies to references as well as hypotheses.

    :param refs: list of reference id lists, one per sentence
    :param hyps: list of hypothesis id lists, aligned with refs
    :return: 100 * corpus BLEU
    """
    # corpus_bleu expects a list of references per sentence, hence the extra nesting.
    refs = [[_cut_at_eos(ref)] for ref in refs]
    hyps = [_cut_at_eos(hyp) for hyp in hyps]
    return 100 * bleu_score.corpus_bleu(refs, hyps)


# Hyperparameters
# Passed as min_count to build_vocab for both the source and the target vocabulary.
min_count = # WRITE ME!
# Passed to EncoderDecoder as hidden_size.
hidden_size = # WRITE ME!
# Batch size of the training and validation DataLoaders.
batch_size = # WRITE ME!
# Number of passes over the training data.
num_epochs = # WRITE ME!
# Learning rate handed to the Adam optimizer.
lr = # WRITE ME!
# Probability with which compute_loss uses teacher forcing on a training batch.
teacher_forcing_rate = # WRITE ME!
# max_length given to the model when translating the test set.
test_max_length = # WRITE ME!


train_X, train_Y, test_X = load_dataset()

# Hold out 10% of the training pairs for validation. The split is seeded with
# the module-level random_state rather than a second hard-coded 42.
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_X, train_Y, test_size=0.1, random_state=random_state)


# Special tokens shared by the source and target vocabularies.
word2id = {
    PAD_TOKEN: PAD,
    BOS_TOKEN: BOS,
    EOS_TOKEN: EOS,
    UNK_TOKEN: UNK,
    }

# NOTE(review): both Vocab instances receive the same dict object — confirm
# that Vocab copies it rather than extending it in place.
vocab_X = Vocab(word2id=word2id)
vocab_Y = Vocab(word2id=word2id)
# Vocabularies are built from the training split only.
vocab_X.build_vocab(train_X, min_count=min_count)
vocab_Y.build_vocab(train_Y, min_count=min_count)

vocab_size_X = len(vocab_X.id2word)
vocab_size_Y = len(vocab_Y.id2word)

# Replace every sentence by its id sequence.
train_X = [sentence_to_ids(vocab_X, sentence) for sentence in train_X]
train_Y = [sentence_to_ids(vocab_Y, sentence) for sentence in train_Y]
valid_X = [sentence_to_ids(vocab_X, sentence) for sentence in valid_X]
valid_Y = [sentence_to_ids(vocab_Y, sentence) for sentence in valid_Y]

train_dataloader = DataLoader(train_X, train_Y, batch_size)
valid_dataloader = DataLoader(valid_X, valid_Y, batch_size, shuffle=False)

model_args = {
    'input_size': vocab_size_X,
    'output_size': vocab_size_Y,
    'hidden_size': hidden_size,
}

model = EncoderDecoder(**model_args).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# 訓練
best_valid_bleu = 0.

for epoch in range(1, num_epochs+1):
    train_loss = 0.
    train_refs = []
    train_hyps = []
    valid_loss = 0.
    valid_refs = []
    valid_hyps = []
    # train
    for batch in train_dataloader:
        batch_X, batch_Y, lengths_X = batch
        loss, gold, pred = compute_loss(
            batch_X, batch_Y, lengths_X, model, optimizer, 
            is_train=True
            )
        train_loss += loss
        train_refs += gold
        train_hyps += pred
    # valid
    for batch in valid_dataloader:
        # WRITE ME!
    
    # 損失をサンプル数で割って正規化
    train_loss /= len(train_dataloader.data) 
    valid_loss /= len(valid_dataloader.data) 
    # BLEUを計算
    train_bleu = calc_bleu(train_refs, train_hyps)
    valid_bleu = calc_bleu(valid_refs, valid_hyps)

    # validationデータでBLEUが改善した場合にはモデルを保存
    if valid_bleu > best_valid_bleu:
        ckpt = model.state_dict()
        best_valid_bleu = valid_bleu

    print('Epoch {}: train_loss: {:5.2f}  train_bleu: {:2.2f}  valid_loss: {:5.2f}  valid_bleu: {:2.2f}'.format(
            epoch, train_loss, train_bleu, valid_loss, valid_bleu))
    print('-'*80)


# Generate translations with the trained model
model.load_state_dict(ckpt)  # restore the best checkpoint
model.eval()

test_X = [sentence_to_ids(vocab_X, sentence) for sentence in test_X]
# When the exercise's DataLoader is reused as-is, test_X is passed as a dummy Y.
test_dataloader = DataLoader(test_X, test_X, 1, shuffle=False)

pred_Y = []
with torch.no_grad():  # inference only, so no autograd graph is needed
    for batch in test_dataloader:
        batch_X, _, lengths_X = batch
        pred = model(batch_X, lengths_X, max_length=test_max_length)
        # Highest-scoring id at every step, flattened (batch size is 1).
        pred = pred.max(dim=-1)[1].view(-1).data.cpu().numpy().tolist()
        if EOS in pred:
            pred = pred[:pred.index(EOS)]  # drop EOS and everything after it
        pred_y = [vocab_Y.id2word[_id] for _id in pred]
        pred_Y.append(pred_y)


# newline='' is what the csv module asks for on files it writes to, and UTF-8
# matches the encoding used when the corpus was read.
with open('submission.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter=' ', lineterminator='\n')
    writer.writerows(pred_Y)