# Train a sentence classifier on BERT features

## Deploy BERT server

Instruction website: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html  
Install the server and client:
``` bash
pip install -U bert-serving-server bert-serving-client  
```
Download and unzip the pretrained BERT model (BERT-Large, Uncased, 1024-dimensional output):  
``` bash
cd ${model_path}
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
unzip uncased_L-24_H-1024_A-16.zip  
```  

Start the BERT server on the local machine: 
``` bash
bert-serving-start -model_dir ${model_path}/uncased_L-24_H-1024_A-16 -max_seq_len=100 -num_worker=1  
bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
```
Then call it from the client side in Python:
``` python
from bert_serving.client import BertClient
bc = BertClient()
bc.encode(['First do it', 'then do it right', 'then do it better'])
```


## Load data as Pandas dataframe

In [4]:
import numpy as np
import pandas as pd

# CSV splits, resolved relative to the notebook's working directory.
train_file_path, test_file_path = "./training.csv", './dev.csv'

# Read both splits into DataFrames (training file first, then the dev file,
# which is used as the test set in the rest of the notebook).
train_df, test_df = map(pd.read_csv, (train_file_path, test_file_path))

# Preview the first rows of the training frame.
train_df.head(5)

Unnamed: 0,claim,claim_evi_pair,evidence,label
0,Cerebral palsy has been documented throughout ...,Cerebral palsy has been documented throughout ...,Cerebral palsy CP is the most common movement ...,0
1,Peter Cetera was a vocalist.,Peter Cetera was a vocalist. ||| Peter Cetera ...,"Peter Cetera With `` If You Leave Me Now , '' ...",1
2,Judy Greer is an American.,Judy Greer is an American. ||| Judy Greer Judy...,Judy Greer Judy Greer born Judith Therese Evan...,1
3,Julie Christie was nominated for an Oscar for ...,Julie Christie was nominated for an Oscar for ...,Julie Christie She came to international atten...,0
4,Wikipedia has zero articles.,Wikipedia has zero articles. ||| Wikipedia Ove...,"Wikipedia Overall , Wikipedia consists of more...",1


In [5]:
# Number of rows in the dev/test split.
test_df.shape[0]

11982

## Feature extraction

### Construct and save bert features to file for reuse

In [None]:
# from bert_serving.client import BertClient
# bc = BertClient()

# for cased restart server with 
# bert-serving-start -model_dir /share/ShareFolder/cased_L-24_H-1024_A-16/ -cased_tokenization -max_batch_size=1024 -gpu_memory_fraction=0.9 -num_worker=1
# for uncased restart server with 
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_batch_size=1024 -gpu_memory_fraction=0.9 -num_worker=1

# case = "_cased"
# case = ""


In [None]:
# # train, test claim encode
# restart server with 
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -gpu_memory_fraction=0.9 -num_worker=1


# train_claim_encode = bc.encode(list(train_df['claim']))
# np.save("./Sentence_encodings/train_claim_encode" + case, train_claim_encode)

# test_claim_encode = bc.encode(list(test_df['claim']))
# np.save("./Sentence_encodings/test_claim_encode" + case, test_claim_encode)

In [None]:
# train, test evidence encode
# restart server with 

# train_evi_encode = bc.encode(list(train_df['evidence']))
# np.save("./Sentence_encodings/train_evi_encode" + case, train_evi_encode)

# test_evi_encode = bc.encode(list(test_df['evidence']))
# np.save("./Sentence_encodings/test_evi_encode" + case, test_evi_encode)



In [None]:
# # train, test claim+evidence pair encode
# restart server with 


# train_pair_encode = bc.encode(list(train_df['claim_evi_pair']))
# np.save("./Sentence_encodings/train_pair_encode" + case, train_pair_encode)

# test_pair_encode = bc.encode(list(test_df['claim_evi_pair']))
# np.save("./Sentence_encodings/test_pair_encode" + case, test_pair_encode)

### Load bert features from file

In [6]:

# Directory holding the cached BERT sentence encodings (the .npy files that the
# commented-out feature-extraction cells above write with np.save).
ENCODING_DIR = "./Sentence_encodings/"


def load_encoding(name):
    """Load one cached encoding array from ``ENCODING_DIR``.

    Parameters
    ----------
    name : str
        File stem without the ``.npy`` extension,
        e.g. ``"train_claim_encode_cased"``.

    Returns
    -------
    The object returned by ``np.load`` for ``ENCODING_DIR + name + ".npy"``.
    """
    return np.load(ENCODING_DIR + name + ".npy")


# Uncased encodings are currently disabled; uncomment to load them as well.
# train_claim_features = load_encoding("train_claim_encode")
# test_claim_features = load_encoding("test_claim_encode")

# train_evi_features = load_encoding("train_evi_encode")
# test_evi_features = load_encoding("test_evi_encode")

# train_pair_features = load_encoding("train_pair_encode")
# test_pair_features = load_encoding("test_pair_encode")

# Cased encodings: claim only, evidence only, and the claim+evidence pair.
train_claim_features_cased = load_encoding("train_claim_encode_cased")
test_claim_features_cased = load_encoding("test_claim_encode_cased")

train_evi_features_cased = load_encoding("train_evi_encode_cased")
test_evi_features_cased = load_encoding("test_evi_encode_cased")

train_pair_features_cased = load_encoding("train_pair_encode_cased")
test_pair_features_cased = load_encoding("test_pair_encode_cased")


In [7]:
# Feature blocks to place side by side (axis=1): claim, evidence, claim+evidence pair.
# To also use the uncased encodings, put train_claim_features, train_evi_features and
# train_pair_features (and their test_* counterparts) at the front of these lists.
train_blocks = [train_claim_features_cased, train_evi_features_cased, train_pair_features_cased]
test_blocks = [test_claim_features_cased, test_evi_features_cased, test_pair_features_cased]

x_train = np.concatenate(train_blocks, axis=1)
x_test = np.concatenate(test_blocks, axis=1)

# Targets are read from the 'label' column of each split.
# NOTE(review): assumes the encoding rows are in the same order as the DataFrame rows — confirm.
y_train = train_df['label'].values
y_test = test_df['label'].values

In [8]:
# Print the shapes of the feature matrices and label vectors for a visual check.
for array in (x_train, y_train, x_test, y_test):
    print(array.shape)

(484632, 3072)
(484632,)
(11982, 3072)
(11982,)


## Simple MLP to train (with bert features)

In [14]:
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam

# Seed NumPy's global random number generator before building the model.
seed = 7
np.random.seed(seed)


# Two-hidden-layer MLP on top of the concatenated BERT features, ending in a
# single sigmoid unit for the binary label.
model = Sequential()
# Only the first layer declares the input size; later layers take their input
# size from the previous layer's output.
model.add(Dense(units=200, activation='relu', input_dim=x_train.shape[1]))
model.add(Dropout(0.3))
model.add(Dense(units=50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=1, activation='sigmoid'))
# optimizer = Adam(lr=0.01)
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer='adam', metrics=['accuracy'])

model.summary()
# SVG(model_to_dot(model).create(prog='dot', format='svg'))

# callbacks
# NOTE(review): 'val_acc' is the metric name reported by the Keras version used
# for the recorded run; newer Keras releases may name it 'val_accuracy' — confirm
# against the installed version.
filepath = "best_weights_sentence.hdf5"
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
# patience=0: training stops after the first epoch whose val_acc does not improve.
earlyStopping = EarlyStopping(monitor='val_acc', patience=0, verbose=0, mode='auto')

callbacks_list = [checkpoint, earlyStopping]

# validation_split=0.15 holds out 15% of the training rows for validation
# (per the Keras docs, the last rows of x/y, taken before shuffling).
model.fit(x=x_train, y=y_train, batch_size=32, epochs=50, validation_split=0.15, callbacks=callbacks_list)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 200)               614600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 51        
Total params: 624,701
Trainable params: 624,701
Non-trainable params: 0
_________________________________________________________________
Train on 411937 samples, validate on 72695 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.80265, saving model to best_

<keras.callbacks.History at 0x7fc2fcdb3a90>

## Apply model to test set

In [29]:
# Restore the checkpointed weights saved during training, then score the test features.
model.load_weights("best_weights_sentence.hdf5")
y_test_predict = model.predict(x_test, batch_size=128, verbose=1)

# Column 0 of the prediction array is the output of the single sigmoid unit.
predicted_prob = y_test_predict[:, 0]

# A prediction counts as correct when the label is 1 and the score is above 0.5,
# or the label is 0 and the score is at most 0.5 (any other label value never counts).
is_correct = ((y_test == 1) & (predicted_prob > 0.5)) | ((y_test == 0) & (predicted_prob <= 0.5))

# Kept as plain Python numbers so the printed value has the same format as before.
correct = float(np.count_nonzero(is_correct))
total = len(y_test)
accuracy = correct / total
print("accuracy on test set: " + str(accuracy))

accoracy on test set: 0.799783007845101
