<a href="https://colab.research.google.com/github/FUJITOSHION/transformer/blob/master/fall_zemi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 候補
- Reformerを実装する => Transformerと比較
- 同等の精度での計算量の違いを示す。

## Transformerの問題点

Attentionの計算量が多い。
特に内積QKの計算。
Reformerではlocality-sensitive hashing (LSH) を使う。

## 注意
実際のTransformerはMulti-Head Attentionだが、今回はシンプルなTransformerを使う。


## Transformerの構造
1. 文章の単語ID列
2. Embedderモジュール(word_id to word_vec)
3. PositionalEncoder
4. TransformerBlock * 2
    1. Layer Normalization
    2. Attention(self-attention)
    3. Dropout
    4. Layer Normalization
    5. FeedForward
    6. Dropout
5. ClassificationHead

In [15]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

In [16]:
class Embedder(nn.Module):
    """Look up a pretrained embedding vector for every word ID.

    Args:
        text_embedding_vectors: 2-D float tensor of pretrained word vectors
            handed to ``nn.Embedding.from_pretrained``; row ``i`` is the
            vector returned for word ID ``i``.
    """

    def __init__(self, text_embedding_vectors):
        super().__init__()

        # The pretrained vectors are loaded frozen, so they are not updated
        # during training.
        self.embeddings = nn.Embedding.from_pretrained(
            embeddings=text_embedding_vectors, freeze=True
        )

    def forward(self, x):
        """Return the embedding vectors for the integer ID tensor ``x``.

        The output has the shape of ``x`` plus one trailing embedding axis.
        """
        return self.embeddings(x)

In [17]:
class PositionalEncoder(nn.Module):
    """Add sinusoidal position information to a batch of word vectors.

    The table follows the standard Transformer formulation: for each even
    dimension index ``j`` the angle is ``pos / 10000 ** (j / vec_dim)``;
    dimension ``j`` stores its sine and dimension ``j + 1`` its cosine.

    Args:
        vec_dim: Dimensionality of each word vector.
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, vec_dim, max_seq_len):
        super().__init__()

        self.vec_dim = vec_dim  # dimensionality of a word vector

        position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        # ``even_dims`` already holds 0, 2, 4, ... so it is used directly as
        # the numerator of the exponent (multiplying by 2 again would square
        # the intended wavelength progression).
        even_dims = torch.arange(0, vec_dim, 2, dtype=torch.float32)
        angles = position / (10000.0 ** (even_dims / vec_dim))

        pe = torch.zeros(max_seq_len, vec_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd vec_dim there is one fewer odd column than even ones.
        pe[:, 1::2] = torch.cos(angles[:, : vec_dim // 2])

        # Leading axis of size 1 broadcasts over the mini-batch. Registering
        # the table as a buffer keeps it gradient-free and lets it follow the
        # module across devices; persistent=False keeps state_dict unchanged.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Scale ``x`` by ``sqrt(vec_dim)`` and add the positional table.

        ``x`` is indexed as (batch, sequence, vec_dim); only the first
        ``x.size(1)`` positions of the table are used, so sequences shorter
        than ``max_seq_len`` are accepted.
        """
        return math.sqrt(self.vec_dim) * x + self.pe[:, : x.size(1)]

In [18]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned projections.

    Args:
        d_model: Size of the feature axis of the inputs and the output.
    """

    def __init__(self, d_model=300):
        super().__init__()

        # Per-position projections (the original note says these would
        # properly be 1-D convolutions).
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: 3-D tensors indexed as (batch, sequence, d_model).
            mask: Tensor whose zero entries mark key positions to ignore;
                an axis is inserted at dim 1 so it broadcasts over the query
                positions (assumes mask is (batch, sequence) — TODO confirm
                against the caller).

        Returns:
            Tuple ``(output, attention_weights)``: the projected attention
            output and the softmax-normalised weights.
        """
        # Each input goes through its own projection layer.
        query = self.query(q)
        key = self.key(k)
        value = self.value(v)

        # Query/key similarity, scaled by sqrt(d_k) so the softmax does not
        # saturate as the feature size grows.
        weights = torch.matmul(query, key.transpose(1, 2))
        weights = weights / math.sqrt(query.size(-1))

        # A large negative score turns into ~0 after the softmax.
        mask = mask.unsqueeze(1)
        weights = weights.masked_fill(mask == 0, -1e9)

        # Normalise over the key axis so each row sums to 1.
        normlized_weights = F.softmax(weights, dim=-1)
        output = torch.matmul(normlized_weights, value)

        output = self.out(output)

        return output, normlized_weights

In [19]:
class FeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Size of the input and output feature axis.
        d_ff: Size of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [20]:
class TransfomerBlock(nn.Module):
    """One encoder block: normalised self-attention followed by a normalised feed-forward.

    Both sub-layers apply LayerNorm to their input, pass the sub-layer output
    through dropout, and add it back onto the un-normalised input.

    Args:
        d_model: Size of the feature axis.
        dropout: Probability used by the two residual dropout layers.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()

        # Layer normalisation in front of each sub-layer.
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)

        # The two sub-layers.
        self.attn = Attention(d_model)
        self.ff = FeedForward(d_model)

        # Dropout applied to each sub-layer output before the residual add.
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block and return ``(output, attention_weights)``.

        ``mask`` is forwarded unchanged to the attention layer.
        """
        # Self-attention: the same normalised tensor serves as q, k and v.
        attn_in = self.norm_1(x)
        attn_out, attn_weights = self.attn(attn_in, attn_in, attn_in, mask)
        after_attn = x + self.dropout_1(attn_out)

        # Feed-forward sub-layer with its own residual connection.
        ff_out = self.ff(self.norm_2(after_attn))
        block_out = after_attn + self.dropout_2(ff_out)

        return block_out, attn_weights

In [21]:
class ClassificationHead(nn.Module):
    """Linear classifier applied to the feature vector at sequence position 0.

    Args:
        d_model: Size of the input feature axis.
        out_dim: Number of output values.
    """

    def __init__(self, d_model=300, out_dim=2):
        super().__init__()

        self.linear = nn.Linear(d_model, out_dim)

        # Re-initialise the layer from normal distributions.
        nn.init.normal_(self.linear.weight, mean=0.0, std=0.02)
        # The bias is drawn from N(0, 1), i.e. normal_'s default std.
        # NOTE(review): confirm this much wider bias init is intended.
        nn.init.normal_(self.linear.bias, mean=0, std=1.0)

    def forward(self, x):
        """Return the linear projection of ``x[:, 0, :]``.

        ``x`` is indexed as [batch_size, text_len, depth].
        """
        first_token = x[:, 0, :]
        return self.linear(first_token)

In [24]:
class TransformerClassification(nn.Module):
    """Text classifier: embedding, positional encoding, two blocks, linear head.

    Args:
        text_embedding_vectors: Pretrained word vectors passed to ``Embedder``.
        d_model: Size of the feature axis used throughout the network
            (presumably must match the width of ``text_embedding_vectors``
            — verify against the data).
        max_seq_len: Number of positions the positional encoder precomputes.
        out_dim: Number of values produced by the classification head.
    """

    def __init__(self, text_embedding_vectors, d_model=300, max_seq_len=256, out_dim=2):
        super().__init__()

        self.net1 = Embedder(text_embedding_vectors)
        # PositionalEncoder names its width parameter ``vec_dim``.
        self.net2 = PositionalEncoder(vec_dim=d_model, max_seq_len=max_seq_len)
        self.net3_1 = TransfomerBlock(d_model=d_model)
        self.net3_2 = TransfomerBlock(d_model=d_model)
        self.net4 = ClassificationHead(d_model=d_model, out_dim=out_dim)

    def forward(self, x, mask):
        """Classify a batch of word-ID sequences.

        Args:
            x: Tensor of word IDs.
            mask: Mask forwarded to both Transformer blocks.

        Returns:
            Tuple ``(class_scores, weights_block_1, weights_block_2)`` where
            the last two are the attention weights of each block.
        """
        x1 = self.net1(x)  # word IDs -> word vectors
        x2 = self.net2(x1)  # add position information
        # Transform the features with self-attention, twice.
        x3_1, normlized_weights_1 = self.net3_1(x2, mask)
        x3_2, normlized_weights_2 = self.net3_2(x3_1, mask)
        # The head classifies from the final features at sequence position 0.
        x4 = self.net4(x3_2)
        return x4, normlized_weights_1, normlized_weights_2

In [23]:
class ReAttention(Attention):
    """Placeholder for an attention layer based on locality-sensitive hashing (LSH).

    Only the projection layers inherited from ``Attention`` are created;
    the LSH computation itself has not been written yet.

    Args:
        d_model: Size of the feature axis, forwarded to ``Attention``.
    """

    def __init__(self, d_model=300):
        # Forward d_model so the inherited layers get the requested width
        # instead of always falling back to the parent's default.
        super().__init__(d_model)

    def forward(self, q, k, v, mask):
        """Not implemented yet.

        Locality-sensitive hashing (LSH); the exact formulation is still to
        be worked out.

        Raises:
            NotImplementedError: Always, so accidental use fails loudly
                instead of returning ``None`` to a caller that unpacks
                ``(output, weights)``.
        """
        raise NotImplementedError("LSH attention is not implemented yet")