# 第7回講義 宿題

### 課題
RNNを用いてIMDbのsentiment analysisを実装してみましょう．

ネットワークの形などに制限はとくになく，今回のLessonで扱った内容以外の工夫も組み込んでもらって構いません．

In [1]:
!pip install portalocker

import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import string
import re
from typing import List, Union

# Fix every RNG this notebook uses so that runs are repeatable.
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)


# Training data. NOTE: allow_pickle=True unpickles arbitrary Python objects,
# so these paths must only ever point at trusted files.
x_train = np.load('/kaggle/input/imdb-dataset/x_train.npy', allow_pickle=True)
t_train = np.load('/kaggle/input/imdb-dataset/t_train.npy', allow_pickle=True)

# Test data (only the inputs are loaded here).
x_test = np.load('/kaggle/input/imdb-dataset/x_test.npy', allow_pickle=True)

# Hold out 20% of the training data as a validation split.
x_train, x_valid, t_train, t_valid = train_test_split(
    x_train,
    t_train,
    test_size=0.2,
    random_state=seed,
)


def text_transform(text: List[int], max_length=256):
    """Truncate a token-id sequence and terminate it with <EOS>.

    <BOS> is already stored in the data as id 1; <EOS> is appended as id 2.

    Args:
        text: Token ids of a single review.
        max_length: Maximum length of the result, <EOS> included
            (for ``max_length >= 1``).

    Returns:
        A ``(tokens, length)`` pair: the new token list and its length.
    """
    eos_id = 2
    # Keep one slot free so that <EOS> always fits.
    keep = max_length - 1
    tokens = text[:keep]
    tokens = tokens + [eos_id]
    return tokens, len(tokens)

def collate_batch(batch):
    """Collate raw samples into padded tensors for a ``DataLoader``.

    A sample is either a ``(label, token_ids)`` tuple or a bare token-id
    sequence (unlabelled data). Every sequence goes through
    ``text_transform`` before being padded to the longest one in the batch.

    Args:
        batch: Samples gathered by the ``DataLoader``.

    Returns:
        ``(labels, tokens, lengths)``: the label tensor (empty when no sample
        carried a label), the padded token ids transposed so the batch
        dimension comes first, and the per-sample lengths after
        ``text_transform``.
    """
    labels, sequences, lengths = [], [], []

    for sample in batch:
        if not isinstance(sample, tuple):
            # Unlabelled sample: work on a copy of the sequence.
            tokens = sample.copy()
        else:
            label, tokens = sample
            labels.append(label)

        tokens, n_tokens = text_transform(tokens)
        sequences.append(torch.tensor(tokens))
        lengths.append(n_tokens)

    # NOTE: <PAD> is id 3 in this homework dataset.
    padded = pad_sequence(sequences, padding_value=3)
    return torch.tensor(labels), padded.T, torch.tensor(lengths)


# Vocabulary size = largest token id found in any split + 1.
# The validation split is included as well: it was carved out of x_train
# above, so an id that occurs only there would otherwise index past the end
# of an embedding table sized with word_num.
word_num = np.concatenate(np.concatenate((x_train, x_valid, x_test))).max() + 1
print(f"単語種数: {word_num}")

Collecting portalocker
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker
Successfully installed portalocker-3.1.1
単語種数: 88587


In [2]:
# Shortest and longest training sequence, measured in tokens.
min_len = min(map(len, x_train))
print(min_len)  # minimum length (7 tokens in the recorded run)
max_len = max(map(len, x_train))
print(max_len)  # maximum length (2494 tokens in the recorded run)

7
2494


In [3]:
# (backbone = LSTM -> Attention)

In [4]:
# Shapes of every split: train inputs/labels, validation inputs/labels, test inputs.
tuple(arr.shape for arr in (x_train, t_train, x_valid, t_valid, x_test))

((32000,), (32000,), (8000,), (8000,), (10000,))

In [5]:
def seed(seed=1234):
    """Seed the PyTorch, NumPy and ``random`` generators with one value.

    NOTE: defining this function rebinds the module-level name ``seed``,
    which earlier in this file holds the integer seed value.

    Args:
        seed: Seed handed to all three generators.
    """
    for seeder in (torch.manual_seed, np.random.seed, random.seed):
        seeder(seed)

### 実装

In [6]:
batch_size = 128

# Labelled splits are fed as (label, token_ids) pairs; only the training
# loader reshuffles its samples.
train_dataloader = DataLoader(
    dataset=list(zip(t_train, x_train)),
    collate_fn=collate_batch,
    batch_size=batch_size,
    shuffle=True,
)
valid_dataloader = DataLoader(
    dataset=list(zip(t_valid, x_valid)),
    collate_fn=collate_batch,
    batch_size=batch_size,
    shuffle=False,
)
# The test split has no labels, so the raw sequences are passed directly.
test_dataloader = DataLoader(
    dataset=x_test,
    collate_fn=collate_batch,
    batch_size=batch_size,
    shuffle=False,
)

In [7]:
import torch.nn as nn

class MultiLayerBiLSTM(nn.Module):
    """Stacked bidirectional LSTM classifier over token-id sequences.

    Token ids are embedded, fed through a multi-layer bidirectional LSTM as a
    packed sequence, and the final forward and backward hidden states of the
    top layer are concatenated and projected to ``output_dim`` values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Size of the classifier output.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings, between LSTM
            layers (when ``n_layers > 1``) and on the final hidden state.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout, padding_idx):
        super().__init__()

        # 1. Embedding layer.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        # 2. Multi-layer bidirectional LSTM.
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer it has no effect and PyTorch emits a UserWarning, so pass 0.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            dropout=lstm_dropout,
        )

        # 3. Classifier head (bidirectional, hence hidden_dim * 2 inputs).
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # 4. Dropout applied to the embeddings and the final hidden state.
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the classifier output for a padded batch.

        Args:
            text: Token ids, shape ``[batch size, sent len]``.
            text_lengths: Unpadded length of every sequence in ``text``.

        Returns:
            Tensor of shape ``[batch size, output_dim]``.
        """
        embedded = self.dropout(self.embedding(text))
        # embedded = [batch size, sent len, emb dim]

        # Pack so the LSTM skips padded positions; the lengths must live on
        # the CPU, and enforce_sorted=False accepts batches in any order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), batch_first=True, enforce_sorted=False
        )

        _, (hidden, _) = self.lstm(packed_embedded)

        # hidden = [num_layers * num_directions, batch size, hid dim]
        # The last two entries are the top layer's forward and backward states.
        last_hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # last_hidden = [batch size, hid dim * 2]

        return self.fc(last_hidden)

# Hyperparameters for this model instance.
VOCAB_SIZE = 88587  # example value
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2  # start with two layers
DROPOUT = 0.4  # dropout rate
PADDING_IDX = 3

# Instantiate the model.
model = MultiLayerBiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    padding_idx=PADDING_IDX,
)

In [8]:
# ===================================================================
# 1. モデル定義 (修正版)
# ===================================================================
class MultiLayerBiLSTM(nn.Module):
    """Stacked bidirectional LSTM classifier (variant without ``padding_idx``).

    Token ids are embedded, fed through a multi-layer bidirectional LSTM as a
    packed sequence, and the final forward and backward hidden states of the
    top layer are concatenated and projected to ``output_dim`` values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Size of the classifier output.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings, between LSTM
            layers (when ``n_layers > 1``) and on the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout):
        super().__init__()

        # The embedding is created without padding_idx in this variant.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer it has no effect and PyTorch emits a UserWarning, so pass 0.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            dropout=lstm_dropout,
        )

        # Classifier head (bidirectional, hence hidden_dim * 2 inputs).
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # Dropout applied to the embeddings and the final hidden state.
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the classifier output for a padded batch.

        Args:
            text: Token ids, shape ``[batch size, sent len]``.
            text_lengths: Unpadded length of every sequence in ``text``.

        Returns:
            Tensor of shape ``[batch size, output_dim]``.
        """
        embedded = self.dropout(self.embedding(text))

        # Pack so the LSTM skips padded positions; the lengths must live on
        # the CPU, and enforce_sorted=False accepts batches in any order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed_embedded)

        # hidden = [num_layers * num_directions, batch size, hid dim]
        # The last two entries are the top layer's forward and backward states.
        last_hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(last_hidden)


# ===================================================================
# 2. ハイパーパラメータとセットアップ (修正版)
# ===================================================================
# --- Hyperparameters ---
emb_dim, hid_dim = 200, 256
n_layers = 2
dropout = 0.4
n_epochs = 10
learning_rate = 1e-3

# --- Device selection ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Model, loss function and optimizer ---
# This model variant takes no padding_idx argument.
model = MultiLayerBiLSTM(word_num, emb_dim, hid_dim, 1, n_layers, dropout)
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Bookkeeping for the best validation score and where its weights are saved.
best_valid_f1 = 0.0
best_epoch = 0
model_save_path = "best_model.pt"


# ===================================================================
# 3. 学習ループ
# ===================================================================
for epoch in range(n_epochs):
    losses_train = []
    losses_valid = []

    # --- Training pass ---
    model.train()
    for label, line, len_seq in train_dataloader:
        optimizer.zero_grad()

        t = label.to(device)
        x = line.to(device)
        # len_seq is left on the CPU: the model's forward hands it to
        # pack_padded_sequence as a CPU tensor.

        logits = model(x, len_seq)

        # squeeze(1) drops only the output dimension, so a final batch of
        # size 1 keeps shape (1,) and still matches the label tensor.
        loss = criterion(logits.squeeze(1), t.float())
        loss.backward()

        # Clip the gradient norm before the weight update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        losses_train.append(loss.item())

    # --- Validation pass ---
    # The label accumulator is deliberately not named t_valid, so the
    # validation label array defined earlier in the file is not overwritten.
    valid_labels = []
    y_pred = []
    model.eval()

    with torch.no_grad():
        for label, line, len_seq in valid_dataloader:
            t = label.to(device)
            x = line.to(device)

            logits = model(x, len_seq).squeeze(1)
            loss = criterion(logits, t.float())

            # Predicted label: sigmoid probability thresholded at 0.5.
            pred = (torch.sigmoid(logits) > 0.5).long()

            valid_labels.extend(t.tolist())
            y_pred.extend(pred.tolist())
            losses_valid.append(loss.item())

    # --- Per-epoch summary (F1 computed once, reported once) ---
    current_f1 = f1_score(valid_labels, y_pred, average='macro')

    print('EPOCH: {}, Train Loss: {:.3f}, Valid Loss: {:.3f}, Validation F1: {:.3f}'.format(
        epoch,
        np.mean(losses_train),
        np.mean(losses_valid),
        current_f1
    ))

    # Checkpointing: keep the weights of the best validation F1 so far.
    # Training itself always runs for all n_epochs.
    if current_f1 > best_valid_f1:
        best_valid_f1 = current_f1
        best_epoch = epoch
        torch.save(model.state_dict(), model_save_path)
        print(f"✨ New best model saved at epoch {epoch} with F1: {current_f1:.4f}")

# After training: the best weights are stored at model_save_path and must be
# loaded back into the model before it is used for inference.
print(f"\nTraining finished. Best model was at epoch {best_epoch} with F1 score: {best_valid_f1:.4f}")

EPOCH: 0, Train Loss: 0.641, Valid Loss: 0.535, Validation F1: 0.741
EPOCH: 0, Train Loss: 0.641, Valid Loss: 0.535, Validation F1: 0.741
✨ New best model saved at epoch 0 with F1: 0.7412
EPOCH: 1, Train Loss: 0.555, Valid Loss: 0.653, Validation F1: 0.606
EPOCH: 1, Train Loss: 0.555, Valid Loss: 0.653, Validation F1: 0.606
EPOCH: 2, Train Loss: 0.448, Valid Loss: 0.449, Validation F1: 0.817
EPOCH: 2, Train Loss: 0.448, Valid Loss: 0.449, Validation F1: 0.817
✨ New best model saved at epoch 2 with F1: 0.8167
EPOCH: 3, Train Loss: 0.349, Valid Loss: 0.348, Validation F1: 0.856
EPOCH: 3, Train Loss: 0.349, Valid Loss: 0.348, Validation F1: 0.856
✨ New best model saved at epoch 3 with F1: 0.8560
EPOCH: 4, Train Loss: 0.299, Valid Loss: 0.305, Validation F1: 0.874
EPOCH: 4, Train Loss: 0.299, Valid Loss: 0.305, Validation F1: 0.874
✨ New best model saved at epoch 4 with F1: 0.8736
EPOCH: 5, Train Loss: 0.257, Valid Loss: 0.352, Validation F1: 0.860
EPOCH: 5, Train Loss: 0.257, Valid Loss: 

In [9]:
import pandas as pd
import torch

# 1. Load the best checkpoint and switch to evaluation mode.
model_save_path = "best_model.pt"
# map_location lets a checkpoint saved on a GPU be restored on whichever
# device this session is using (e.g. a CPU-only run).
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.eval()

# 2. Accumulator for the predicted labels.
y_pred = []

# 3. Inference without gradient tracking.
with torch.no_grad():
    # collate_batch always yields (labels, tokens, lengths); for the
    # unlabelled test set the label tensor is empty and is ignored here.
    for _, line, len_seq in test_dataloader:
        x = line.to(device)
        # len_seq stays on the CPU, as pack_padded_sequence needs it there.

        logits = model(x, len_seq)

        # Turn the logits into probabilities.
        y = torch.sigmoid(logits)

        # Round to a 0/1 label. reshape(-1) always yields a 1-D vector,
        # whereas squeeze() would collapse a batch of size 1 to a scalar.
        pred = y.round().reshape(-1)

        y_pred.extend(pred.cpu().tolist())

# 4. Write the submission file.
print(f"Total predictions: {len(y_pred)}")
submission = pd.Series(y_pred, name='label', dtype=int)
submission.to_csv('submission.csv', header=True, index_label='id')

print("submission.csv has been created successfully.")

Total predictions: 10000
submission.csv has been created successfully.
