# Simple Text Classification using BERT and Keras

- toxic  
- severe_toxic  
- obscene  
- threat  
- insult  
- identity_hate

- install BERT tokenizer from the BERT python module

In [None]:
!pip install bert-for-tf2

In [None]:
!pip install sentencepiece

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.0.0
Hub version:  0.8.0


## BERT Embedding Layer

- tf.hub 의 pre-trained parameter 로 BERT model 초기화 하고, downstream task 이 labeled data 로 모든 parameter 를 fine-tunning 한다.

BERT model 은 3 가지의 input token 을 필요로 한다.

- input_word_ids : vocaburary 를 이용하여 input token 을 index 로 convert  
- input_mask : 0 for padding. 1 for valid tokens  
- segment_ids : 0 는 1st segment, 1 은 2nd segment

pooped_out 은 전체 input sequence 를 표현하고, sequence_output 은 context 의 각 input token 을 표현한다.

In [5]:
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)

MAX_SEQ_LEN=128
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
print(pooled_output)
print(sequence_output)

Tensor("keras_layer_1/Identity:0", shape=(None, 768), dtype=float32)
Tensor("keras_layer_1/Identity_1:0", shape=(None, None, 768), dtype=float32)


## Model creation 
pre-trained model 에 simple classification layer 를 추가

In [8]:
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(6, activation="sigmoid")(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=out)


model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [9]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_4[0][0]                    
                                                                 input_5[0][0]              

## Tokenization
- BERT 는 30,000 개의 token vocabulary 를 가진 WordPiece Embedding 사용  
- original vocab file 을 이용하여 tokenizer import

In [10]:
FullTokenizer=bert.bert_tokenization.FullTokenizer

vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()

do_lower_case=bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file, do_lower_case)

In [11]:
def get_masks(tokens, max_seq_length):
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [12]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

## Preparing Training Data

In [13]:
import os
os.environ['KAGGLE_USERNAME'] = "trimurt001" # username from the json file
os.environ['KAGGLE_KEY'] = "b8265fc98880a5f19a51ac0bf2573552" # key from the json file

In [14]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

'kaggle'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [15]:
!unzip train.csv.zip
!unzip test.csv.zip	


'unzip'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.
'unzip'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [18]:
!ls

sample_data		   test.csv	 test_labels.csv.zip  train.csv.zip
sample_submission.csv.zip  test.csv.zip  train.csv


In [41]:
df.sample?

In [42]:
import pandas as pd

df=pd.read_csv('train.csv')

df = df.sample(frac=1)   # frac = 1 : 100%
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
127382,a944167489a27b8f,"""\n\nThis is taken from the article cited: """"T...",0,0,0,0,0,0
33821,5a32893b83d5ba66,You have only accused me here and no explanati...,0,0,0,0,0,0
82639,dd0e1bed2a93f115,"WP:SOFIXIT.\nTake care, however, not to remove...",0,0,0,0,0,0
117196,7245807e1ae3b573,Why was deMause's comment copied into the arti...,0,0,0,0,0,0
44465,76bee0f6c212f29c,"""\n\nI'm shocked at your """"holier than thou"""" ...",0,0,0,0,0,0


In [20]:
train_sentences = df["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = df[list_classes].values

- 각 sequence 의 1st token 은 언제나 [CLS] special classification token 이다. [SEP] 은 special separator token 이다.

In [21]:
def create_single_input(sentence, MAX_LEN):
  
  stokens = tokenizer.tokenize(sentence)
  
  stokens = stokens[:MAX_LEN]
  
  stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
  ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
  masks = get_masks(stokens, MAX_SEQ_LEN)
  segments = get_segments(stokens, MAX_SEQ_LEN)

  return ids,masks,segments

In [22]:
def create_input_array(sentences):

  input_ids, input_masks, input_segments = [], [], []

  for sentence in tqdm(sentences,position=0, leave=True):
  
    ids,masks,segments=create_single_input(sentence,MAX_SEQ_LEN-2)

    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

  return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

bert_layer 의 sequence_output 을 AveragePooling layer 에 통과사키고 6 unit 의 output layer 에 통과시켜 6 class classification 을 한다.

In [23]:
inputs = create_input_array(train_sentences)

100%|██████████| 159571/159571 [02:49<00:00, 944.16it/s]


In [24]:
model.fit(inputs, train_y, epochs=1, batch_size=32, validation_split=0.2, shuffle=True)



<tensorflow.python.keras.callbacks.History at 0x7f328e8e6240>

## Predict
- new text data 를 predict 하기 위해 BERT input 으로 변환 후 predict() method 로 predict

- [toxic, severe_toxic, obscene, threat, insult, identity_hate]

In [31]:
test_df=pd.read_csv("test.csv")

test_sentences = test_df["comment_text"].fillna("CVxTz").values

test_inputs=create_input_array(test_sentences[110:150])

print(model.predict(test_inputs)[0])


100%|██████████| 40/40 [00:00<00:00, 741.58it/s]


[0.08126447 0.00891771 0.05784464 0.00303013 0.04035437 0.00531961]
