<a href="https://colab.research.google.com/github/CAU-CAPSTONE-2-Friendly-Keyboard/Models/blob/main/%EB%B9%84%EC%86%8D%EC%96%B4%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 설치

In [38]:
!pip install transformers==4.1.1
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing

In [39]:
import math
from datasets import load_dataset
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer

## DataLoader

In [40]:
# Read the pipe-delimited text file with the generic "csv" loader.
# The result is indexed by split name; the rows are read back from 'train' in the next cell.
dataset = load_dataset("csv",data_files='./sample_data/dataset.txt',sep='|')



  0%|          | 0/1 [00:00<?, ?it/s]

In [41]:
# Peek at the first training row to confirm the 'text' / 'label' columns were parsed.
dataset['train'][0]

{'text': '좌배 까는건 ㅇㅂ', 'label': 1}

## pretrained Model 불러오기

In [42]:


# Load the sequence-classification model and the tokenizer published under the same
# checkpoint name, so both come from the same source.
model = AutoModelForSequenceClassification.from_pretrained('JminJ/kcElectra_base_Bad_Sentence_Classifier')
tokenizer = AutoTokenizer.from_pretrained('JminJ/kcElectra_base_Bad_Sentence_Classifier')


## 데이터 전처리

In [43]:
# Display the tokenizer's repr (vocabulary size, maximum length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='JminJ/kcElectra_base_Bad_Sentence_Classifier', vocab_size=50135, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [44]:
def preprocessing(data):
  """Tokenize the 'text' field of `data` with the notebook-level `tokenizer`.

  Args:
    data: mapping with a 'text' entry (a batch of rows when used with
      `dataset.map(..., batched=True)`).

  Returns:
    Whatever `tokenizer(...)` returns for that text; no padding or truncation
    options are passed.
  """
  texts = data['text']
  encoded = tokenizer(texts)
  return encoded

In [45]:
# Run `preprocessing` over the dataset in batches; %time reports how long the map call takes.
%time tokenized_dataset = dataset.map(preprocessing,batched=True)



CPU times: user 17.3 ms, sys: 875 µs, total: 18.1 ms
Wall time: 26.5 ms


In [46]:
# Inspect the first row again: besides 'text' and 'label' it now carries the tokenizer
# outputs (input_ids, token_type_ids, attention_mask).
tokenized_dataset['train'][0]

{'text': '좌배 까는건 ㅇㅂ',
 'label': 1,
 'input_ids': [2, 2896, 4225, 14257, 4030, 116, 4770, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

Attention을 이용한 Masking 연결을 위한 기존 모델 attention 구조 및 연산 확인

In [4]:
# Layer normalization over a trailing dimension of size 768; it is applied to the summed
# query weight matrix in the next cell.
# NOTE(review): 768 is presumably the model's hidden size - confirm against model.config.
layernorm = nn.LayerNorm(768)

In [5]:
# Exploration: accumulate the self-attention query projection weights of every encoder
# layer, then layer-normalize the sum.
layers = model.electra.encoder.layer
# Start from the first layer's query weight and add each remaining layer's weight to it.
tmp_query = layers[0].attention.self.query.weight
for layer in layers[1:]:
  tmp_query = torch.add(tmp_query,layer.attention.self.query.weight)
  # Prints the running sum after every addition (one full tensor dump per layer).
  print(tmp_query)
# `layernorm` comes from the previous cell and normalizes over the last dimension.
query_result = layernorm(tmp_query)

print(query_result.shape)
print(query_result)

tensor([[-0.0313,  0.0637,  0.0118,  ...,  0.0840,  0.0239, -0.0226],
        [ 0.1325, -0.0024, -0.0972,  ...,  0.0238,  0.0708, -0.1140],
        [ 0.0134, -0.0985,  0.0316,  ...,  0.0356,  0.1798, -0.0309],
        ...,
        [ 0.1682,  0.0282, -0.0483,  ..., -0.0115, -0.0993,  0.0424],
        [ 0.0239, -0.0051,  0.1035,  ..., -0.0839, -0.0149, -0.1470],
        [ 0.0264, -0.0629,  0.1732,  ...,  0.1792,  0.0791, -0.0692]],
       grad_fn=<AddBackward0>)
tensor([[-0.0224,  0.0687,  0.0400,  ...,  0.1129,  0.0041, -0.0162],
        [ 0.1064, -0.0062, -0.1138,  ...,  0.0996,  0.1383, -0.1605],
        [ 0.1204, -0.1723,  0.0691,  ...,  0.0011,  0.1476, -0.0410],
        ...,
        [ 0.1530,  0.0350, -0.1209,  ..., -0.0819,  0.0037, -0.0195],
        [ 0.0744,  0.0670,  0.0794,  ..., -0.0786, -0.0490, -0.2046],
        [ 0.0226, -0.0184,  0.0607,  ...,  0.1737,  0.0227,  0.0250]],
       grad_fn=<AddBackward0>)
tensor([[-0.0212,  0.1165,  0.2464,  ...,  0.1626,  0.1504,  0.0431],


## 모델

In [7]:
class MaskingAttention(nn.Module):
  """Attention read-out for the first ([CLS]) position over a sequence.

  Only the attention row of position 0 is kept, and position 0 is prevented
  from attending to itself by adding -10000 to its own score.

  Args:
    config: object exposing `hidden_size` and the number of attention heads,
      either as `num_att_heads` or, as a fallback, as `num_attention_heads`
      (the attribute name used by Hugging Face model configs).
  """

  def __init__(self,config):
    # nn.Module must be initialised before sub-modules can be assigned as attributes.
    super().__init__()
    num_heads = getattr(config, 'num_att_heads', None)
    if num_heads is None:
      num_heads = config.num_attention_heads
    self.num_att_heads = num_heads
    self.size_att_head = config.hidden_size // self.num_att_heads
    self.size_all_head = self.num_att_heads * self.size_att_head
    # Optional projections; only used when forward() is called with query_att / key_att.
    self.query = nn.Linear(config.hidden_size, config.hidden_size)
    self.key = nn.Linear(config.hidden_size, config.hidden_size)

  def transpose_score(self,x):
    """Split the last dimension into heads: [batch, seq, hidden] -> [batch, num_head, seq, head_size]."""
    new_shape = x.size()[:-1] + (self.num_att_heads, self.size_att_head)
    x = x.view(*new_shape)
    return x.permute(0,2,1,3)

  def _multi_head(self, query, key, value, attention_mask):
    """Per-head attention of position 0 over the sequence; returns (output, scores, probs)."""
    query_layer = self.transpose_score(query)
    key_layer = self.transpose_score(key)
    value_layer = self.transpose_score(value)

    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    # Keep only the row of position 0: [batch, num_head, 1(cls), seq_length]
    attention_scores = attention_scores[:, :, 0:1, :]
    attention_scores = attention_scores / math.sqrt(self.size_att_head)
    # The additive mask broadcasts over the head dimension.
    attention_scores = attention_scores + attention_mask
    # Normalize the attention scores to probabilities.
    attention_probs = torch.softmax(attention_scores, dim=-1)

    # context_layer size is [batch, num_head, 1(cls), head_size]
    context_layer = torch.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    # Merge the heads back: [batch, 1(cls), all_head_size]
    merged_shape = context_layer.size()[:-2] + (self.size_all_head,)
    attention_output = context_layer.view(*merged_shape)

    # Reported scores / probs are laid out as [batch, 1(cls), num_head, seq_length]
    attention_probs = attention_probs.permute(0, 2, 1, 3).contiguous()
    attention_scores = attention_scores.permute(0, 2, 1, 3).contiguous()
    return attention_output, attention_scores, attention_probs

  def _single_head(self, query, key, value, attention_mask):
    """Attention of position 0 over the sequence without head splitting; returns (output, scores, probs)."""
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = torch.matmul(query, key.transpose(-1, -2))
    # Keep only the row of position 0: [batch, 1(cls), seq_length]
    attention_scores = attention_scores[:, 0:1, :]
    # Scaled by the per-head size, the same factor as the multi-head path.
    attention_scores = attention_scores / math.sqrt(self.size_att_head)
    # Drop the head dimension of the mask so it lines up with the 3-D scores.
    attention_scores = attention_scores + attention_mask.squeeze(1)
    # Normalize the attention scores to probabilities.
    attention_probs = torch.softmax(attention_scores, dim=-1)

    # attention_output size is [batch, 1(cls), hidden]
    attention_output = torch.matmul(attention_probs, value)
    return attention_output, attention_scores, attention_probs

  def forward(self, query_hidden_states, key_hidden_states, value_hidden_states, attention_mask,
                query_att=False, key_att=False, multi_head=True):
    """Attend from position 0 of the query states over the key/value states.

    Args:
      query_hidden_states, key_hidden_states, value_hidden_states: 3-D tensors
        [batch, seq_length, hidden] (the original note gives [32, 128, 768] as an example).
      attention_mask: 4-D additive mask that broadcasts against
        [batch, num_head, 1, seq_length] scores. It is not modified.
        NOTE(review): presumably 0 for visible tokens and a large negative value for
        padding - confirm with the caller.
      query_att: if True, project the query states through `self.query` first.
      key_att: if True, project the key states through `self.key` first.
      multi_head: if True, split into heads; otherwise use one head over the full width.

    Returns:
      (attention_output, cls_info). `attention_output` is [batch, 1, hidden].
      `cls_info` holds the masked 'scores' and softmax 'probs', shaped
      [batch, 1, num_head, seq_length] for multi-head and [batch, 1, seq_length] otherwise.
    """
    # to save weight information
    cls_info = {}

    # choose to use query_attention matrix when update attention_probs
    mixed_query_layer = self.query(query_hidden_states) if query_att else query_hidden_states
    mixed_key_layer = self.key(key_hidden_states) if key_att else key_hidden_states
    mixed_value_layer = value_hidden_states

    # block attention score cls to itself; work on a copy so the caller's mask
    # does not accumulate another -10000 on every call
    attention_mask = attention_mask.clone()
    attention_mask[:, :, :, 0] -= 10000

    attend = self._multi_head if multi_head else self._single_head
    attention_output, attention_scores, attention_probs = attend(
        mixed_query_layer, mixed_key_layer, mixed_value_layer, attention_mask)

    cls_info['scores'] = attention_scores
    cls_info['probs'] = attention_probs

    return attention_output, cls_info

In [None]:
class FriendlyModel(nn.Module):
  """Wrapper around the pretrained bad-sentence classifier plus a MaskingAttention head.

  Args:
    config: object providing `num_labels` and the attributes `MaskingAttention` reads.
  """

  def __init__(self,config):
    # nn.Module.__init__ takes no arguments; it must run before sub-modules are assigned.
    super().__init__()
    self.num_labels = config.num_labels
    self.model = AutoModelForSequenceClassification.from_pretrained('JminJ/kcElectra_base_Bad_Sentence_Classifier')
    # An instance (not the class) sized to the loaded model, because mean_layer()
    # normalizes that model's own weight matrices.
    self.layerNorm = nn.LayerNorm(self.model.config.hidden_size)
    self.maskingAttention = MaskingAttention(config)
    # Reuse the classification head of the model loaded above rather than a notebook global.
    self.classifier = self.model.classifier

  def mean_layer(self,selected):
    """Sum one self-attention projection weight over all encoder layers and layer-normalize it.

    Args:
      selected: attribute name of the projection on each layer's `attention.self`
        sub-module, e.g. 'query' (also 'key' / 'value' if the sub-module exposes them).

    Returns:
      The layer-normalized sum of the selected weight matrices. The weights are summed,
      not divided by the layer count.
    """
    layers = self.model.electra.encoder.layer
    weights = [getattr(layer.attention.self, selected).weight for layer in layers]
    summed = weights[0]
    for weight in weights[1:]:
      summed = torch.add(summed, weight)
    return self.layerNorm(summed)

  def forward(self,x):
    """Placeholder forward pass: returns the input unchanged."""
    return x
