In [28]:
# Mount Google Drive at /content/drive; a later cell changes into a project
# folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the pinned bert-tensorflow release and import its tokenization module.
# NOTE(review): the name `tokenization` is rebound by the plain
# `import tokenization` in the imports cell further down — confirm which of the
# two modules is actually meant to be used.
!pip install bert-tensorflow==1.0.1
from bert import tokenization

In [None]:
# Install extra packages (unpinned) and download a standalone tokenization.py
# from the tensorflow/models master branch into the current working directory.
# NOTE(review): both a pip package named `tokenization` and a downloaded
# tokenization.py exist after this cell; which one `import tokenization`
# resolves to depends on the import path — confirm.
# NOTE(review): `transformers` and `sentencepiece` are not imported by any cell
# visible in this notebook — presumably unused; verify before removing.
!pip install tokenization
!pip install transformers
!pip install sentencepiece
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/tools/tokenization.py

In [31]:
import numpy as np
import pandas as pd
import tensorflow as tf
%tensorflow_version 2.x
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive; the
# CSV and checkpoint files in later cells are referenced by relative path.
os.chdir('/content/drive/My Drive/課程/AICap/final/NYCU_AICap_FinalProject') 
# Last expression of the cell: display the folder contents.
os.listdir()

In [34]:
# Load the training set (path relative to the working directory) and preview
# the first rows.
# NOTE(review): the recorded preview shows "Unnamed: 0.1" and "Unnamed: 0"
# columns — these look like index columns written out by earlier saves; confirm
# and drop them if they are not needed.
train = pd.read_csv("train_clean.csv")
train.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target
0,0,1,,,Our Deeds Reason # earthquake May ALLAH Forgive u,1.0
1,1,4,,,Forest fire near La Ronge Sask Canada,1.0
2,2,5,,,All resident asked shelter place notified offi...,1.0
3,3,6,,,people receive # wildfire evacuation order Cal...,1.0
4,4,7,,,Just got sent photo Ruby # Alaska smoke # wild...,1.0


In [35]:
# Load the BERT module from TF Hub as a trainable Keras layer (its weights are
# updated during fitting).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)
# Read the vocabulary file path and the lower-casing flag exposed by the loaded
# module; both are passed to the tokenizer constructed in the next cell.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()


In [36]:
# Build the tokenizer from the vocabulary and casing settings read off the
# loaded BERT module.
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [37]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings. ``None`` and float NaN entries (what
            pandas yields for an empty CSV field) are encoded as empty text
            instead of being handed to the tokenizer.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row. Must be at least 2 so the two
            special tokens always fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number_of_texts, max_len)``. ``masks`` is 1 over real tokens
        (including the special tokens) and 0 over padding; ``segment_ids`` is
        all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # A missing value is not a string; `text != text` is the NaN check.
        if text is None or (isinstance(text, float) and text != text):
            text = ""

        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        token_ids += [0] * pad_len

        all_tokens.append(token_ids)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # Explicit dtype and shape keep the result 2-D and integer-typed even when
    # `texts` is empty (a bare np.array([]) would be a 1-D float array).
    shape = (len(all_tokens), max_len)
    return (
        np.array(all_tokens, dtype=int).reshape(shape),
        np.array(all_masks, dtype=int).reshape(shape),
        np.array(all_segments, dtype=int).reshape(shape),
    )

In [38]:
# Pull the labels out of the training frame and encode its text column into
# length-160 BERT inputs (ids, masks, segment ids).
train_labels = train["target"].values
train_input = bert_encode(train["text"].values, tokenizer, max_len=160)

In [13]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier directly on top of a BERT layer.

    Args:
        bert_layer: Keras layer called with
            ``[input_word_ids, input_mask, segment_ids]`` and returning a pair
            of outputs; the second one is indexed as ``[:, 0, :]`` to take the
            vector at the first sequence position.
        max_len: Length of each of the three int32 input sequences.

    Returns:
        A compiled Keras ``Model`` with one sigmoid output unit, using Adam
        (learning rate 1e-5), binary cross-entropy loss and accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first output of the layer is not used by this head.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only the vector at position 0 of every sequence as the features.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the legacy `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [14]:
# Instantiate the single-layer classifier for length-160 inputs and print its
# layer summary.
model = build_model(bert_layer=bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 160)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 160)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 160)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 160, 768)]                'input_mask[0][0]',         

  super(Adam, self).__init__(name, **kwargs)


In [27]:
# Write model.h5 only on epochs where the validation loss improves.
checkpoint = ModelCheckpoint(
    filepath='model.h5',
    save_best_only=True,
    monitor='val_loss',
)

# Train for 4 epochs in batches of 16, using a 20% validation split.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=4,
    validation_split=0.2,
    callbacks=[checkpoint],
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [39]:
def build_model2(bert_layer, max_len=512):
  """Build and compile a binary classifier with a two-layer dense head on BERT.

  Args:
    bert_layer: Keras layer called with
      ``[input_word_ids, input_mask, segment_ids]`` and returning a pair of
      outputs; the second one is indexed as ``[:, 0, :]`` to take the vector
      at the first sequence position.
    max_len: Length of each of the three int32 input sequences.

  Returns:
    A compiled Keras ``Model``: Dense(256, relu) -> Dropout(0.4) ->
    Dense(128, relu) -> Dropout(0.4) -> Dense(1, sigmoid), using Adam
    (learning rate 1e-5), binary cross-entropy loss and accuracy metric.
  """
  input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
  input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
  segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

  # The first output of the layer is not used by this head.
  _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
  # Keep only the vector at position 0 of every sequence as the features.
  clf_output = sequence_output[:, 0, :]
  dense_layer1 = Dense(units=256, activation='relu')(clf_output)
  dense_layer1 = Dropout(0.4)(dense_layer1)
  dense_layer2 = Dense(units=128, activation='relu')(dense_layer1)
  dense_layer2 = Dropout(0.4)(dense_layer2)
  out = Dense(1, activation='sigmoid')(dense_layer2)

  model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
  # `learning_rate` is the supported keyword; the legacy `lr` alias is
  # deprecated and rejected by newer Keras releases.
  model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

  return model

In [40]:
# Give model2 its own freshly loaded BERT layer. Reusing the shared, trainable
# `bert_layer` would start model2 from the weights already fine-tuned while
# fitting `model` whenever the notebook is run top to bottom; the summary
# recorded for this cell lists a separate `keras_layer_1`, i.e. a second layer.
model2 = build_model2(hub.KerasLayer(module_url, trainable=True), max_len=160)
model2.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 160)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 160)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 160)]        0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 160, 768)]                'input_mask[0][0]',       

  super(Adam, self).__init__(name, **kwargs)


In [41]:
# Write model2.h5 only on epochs where the validation loss improves.
checkpoint = ModelCheckpoint(
    filepath='model2.h5',
    save_best_only=True,
    monitor='val_loss',
)

# Train for 7 epochs in batches of 16, using a 20% validation split.
train_history = model2.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=7,
    validation_split=0.2,
    callbacks=[checkpoint],
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [42]:
# Restore the checkpoint saved during training before predicting.
model2.load_weights('model2.h5')

# Read the test set and the submission template, then encode the test text
# with the same max_len used for the training data.
test = pd.read_csv("test_clean.csv")
submission = pd.read_csv("sample_submission.csv")
test_input = bert_encode(test["text"].values, tokenizer, max_len=160)

# Round the sigmoid outputs to integer labels and write the submission file.
test_pred = model2.predict(test_input)
submission["target"] = test_pred.round().astype(int)
submission.to_csv('BERT_submission.csv', index=False)

Score: 0.82378