## Deploy BERT server

Instruction website: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html  
Install the server and client packages:
``` bash
pip install -U bert-serving-server bert-serving-client  
```
Download and unzip the pretrained BERT model (BERT-Large, Uncased, 1024-dimensional output):  
``` bash
cd ${model_path}
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
unzip uncased_L-24_H-1024_A-16.zip  
```  

Start the BERT server on the local machine: 
``` bash
# Generic form: serve the model that was unpacked under ${model_path}.
bert-serving-start -model_dir ${model_path}/uncased_L-24_H-1024_A-16 -max_seq_len=100 -num_worker=1  
# Variant with an absolute model path, a longer -max_seq_len and a GPU memory cap.
# NOTE(review): these look like two alternative invocations rather than steps to run in sequence — confirm and run only the one you need.
bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
```
Then call the server from a Python client:
``` python
from bert_serving.client import BertClient
# Create a client; no arguments are passed, so the client's default connection settings are used.
bc = BertClient()
# Send three example sentences to the server for encoding.
bc.encode(['First do it', 'then do it right', 'then do it better'])
```


## Load data as Pandas dataframe

In [1]:
import json
import numpy as np
import pandas as pd

# Path of the training JSON file.
train_file_path = "./JSONFiles/" + "train_with_text.json"

# Switch for the inference input: True selects the "result_test_..." file,
# False selects the "dev_with_text" file.
use_test_file = True
test_file_path = './JSONFiles/' + (
    'result_test_521_tfidf080_text.json' if use_test_file else 'dev_with_text.json'
)

# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# non-ASCII text is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open(train_file_path, mode='r', encoding='utf-8') as f:
    train = json.load(f)
with open(test_file_path, mode='r', encoding='utf-8') as f:
    test = json.load(f)

# Accumulates the final per-record output. load_test_data() pre-fills it for
# records without usable evidence; the prediction cell adds the rest before
# the whole dict is dumped to JSON.
result_dict = {}

def load_training_data(dataset: dict) -> list:
    """Flatten the training records into a list of row dicts.

    Records are visited in sorted-key order. For every record the
    "evidence_texts" entries are concatenated (no separator) into a single
    string; an empty result or a lone newline is replaced by the placeholder
    "no word". The "label" value is one-hot encoded into the "SUP",
    "NOINFO" and "REF" fields, with anything other than "SUPPORTS" /
    "REFUTES" counted as NOINFO.

    Args:
        dataset: mapping of record key -> record dict providing "claim",
            "evidence_texts" and "label".

    Returns:
        A list of dicts with the keys "claim", "evi_text",
        "claim_with_evi_text", "SUP", "NOINFO" and "REF".
    """
    rows = []
    for record_id in sorted(dataset):
        entry = dataset[record_id]
        claim = entry.get("claim")
        evidence = ''.join(entry.get("evidence_texts"))
        # Fall back to a placeholder when there is no evidence text at all.
        if evidence in ('', '\n'):
            evidence = "no word"

        # One-hot flags; any label that is neither SUPPORTS nor REFUTES
        # falls through to NOINFO.
        label = entry.get("label")
        is_supports = label == "SUPPORTS"
        is_refutes = (not is_supports) and label == "REFUTES"

        rows.append({
            "claim": claim,
            "evi_text": evidence,
            "claim_with_evi_text": claim + " ||| " + evidence,
            "SUP": int(is_supports),
            "NOINFO": int(not (is_supports or is_refutes)),
            "REF": int(is_refutes)
        })
    return rows

def load_test_data(dataset: dict) -> list:
    """Flatten the test records into a list of row dicts.

    Records are visited in sorted-key order. A record whose "evidence" list
    is empty, or whose concatenated "evidence_texts" is exactly a single
    newline, is not returned: it is written straight into the module-level
    ``result_dict`` with the label "NOT ENOUGH INFO" and an empty evidence
    list. For the remaining records an empty evidence string is replaced by
    the placeholder "no word".

    Args:
        dataset: mapping of record key -> record dict providing "claim",
            "evidence" and "evidence_texts".

    Returns:
        A list of dicts with the keys "key", "claim", "evidence",
        "claim_with_evi_text" and "evi_text".
    """
    rows = []
    for record_id in sorted(dataset):
        entry = dataset[record_id]
        claim = entry.get("claim")
        evidence_ids = entry.get("evidence")
        evidence = ''.join(entry.get("evidence_texts"))

        has_usable_evidence = len(evidence_ids) != 0 and evidence != '\n'
        if not has_usable_evidence:
            # Nothing to classify: record the answer directly and move on.
            result_dict[record_id] = {
                "claim": claim,
                "label": "NOT ENOUGH INFO",
                "evidence": []
            }
            continue

        if evidence == '':
            evidence = "no word"

        rows.append({
            "key": record_id,
            "claim": claim,
            "evidence": evidence_ids,
            "claim_with_evi_text": claim + " ||| " + evidence,
            "evi_text": evidence
        })
    return rows

# Build one DataFrame per split from the row dicts returned by the loaders.
# Note that load_test_data() also fills result_dict as a side effect for
# records it filters out.
train_df = pd.DataFrame(load_training_data(train))
test_df = pd.DataFrame(load_test_data(test))

# Preview the first ten training rows.
train_df[0: 10]

Unnamed: 0,NOINFO,REF,SUP,claim,claim_with_evi_text,evi_text
0,0,1,0,Ireland does not have relatively low-lying mou...,Ireland does not have relatively low-lying mou...,Ireland The island 's geography comprises rela...
1,0,0,1,The drama Dark Matter stars Taylor Schilling.,The drama Dark Matter stars Taylor Schilling. ...,Taylor Schilling She made her film debut in th...
2,0,0,1,"In 1932, Prussia was taken over.","In 1932, Prussia was taken over. ||| Prussia I...","Prussia In the Weimar Republic , the state of ..."
3,0,0,1,IZombie premiered in 2015.,IZombie premiered in 2015. ||| IZombie TV seri...,IZombie TV series The series premiered on Marc...
4,0,0,1,Ronald Reagan had a nationality.,Ronald Reagan had a nationality. ||| Ronald Re...,Ronald Reagan Ronald Wilson Reagan February 6 ...
5,0,0,1,Samoa Joe wrestles professionally.,Samoa Joe wrestles professionally. ||| Samoa J...,Samoa Joe Nuufolau Joel `` Joe '' Seanoa born ...
6,0,0,1,University of Oxford is in the universe.,University of Oxford is in the universe. ||| U...,University of Oxford The University of Oxford ...
7,1,0,0,The Renaissance began online.,The Renaissance began online. ||| Mulankunnath...,Mulankunnathukavu railway station Two Shornur-...
8,0,0,1,Portia de Rossi appeared on Scandal.,Portia de Rossi appeared on Scandal. ||| Porti...,Portia de Rossi She appeared as a regular cast...
9,0,1,0,The Berlin Wall was only standing for 10 years.,The Berlin Wall was only standing for 10 years...,Berlin Wall The Berlin Wall Berliner Mauer was...


In [2]:
# Number of test rows left to classify, i.e. excluding the records that load_test_data() already answered in result_dict.
len(test_df)

11879

## Feature extraction

### Construct and save bert features to file for reuse

In [7]:
from bert_serving.client import BertClient
# Client used by the encoding cells below; no arguments are passed, so the client's default connection settings are used.
bc = BertClient()

# The server must be serving the model that matches the `case` suffix set at the end of this cell.
# For uncased, restart the server with:
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_batch_size=384 -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
# For cased, restart the server with:
# bert-serving-start -model_dir /share/ShareFolder/cased_L-24_H-1024_A-16/ -cased_tokenization -max_batch_size=384 -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1




# Suffix appended to the file names of the saved encodings:
# "" for the uncased model, "_cased" for the cased model.
# case = ""
case = "_cased"

In [8]:
# Encode the claim text of the train and test sets.
# Restart the server with (note the shorter -max_seq_len=50 used for claims):
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=50 -gpu_memory_fraction=0.9 -num_worker=1
# NOTE(review): the command above names the uncased model; when `case` is "_cased" the cased model must be served instead — confirm before running.


# Training-set encoding is disabled here — presumably it was already saved by an earlier run; TODO confirm.
# train_claim_encode = bc.encode(list(train_df['claim']))
# np.save("./BERT_MLP_encodings/train_claim_encode" + case, train_claim_encode)

test_claim_encode = bc.encode(list(test_df['claim']))
np.save("./BERT_MLP_encodings/test_claim_encode" + case, test_claim_encode)

In [9]:
# Encode the evidence text of the train and test sets.
# Restart the server with:
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
# NOTE(review): the command above names the uncased model; when `case` is "_cased" the cased model must be served instead — confirm before running.

# Training-set encoding is disabled here — presumably it was already saved by an earlier run; TODO confirm.
# train_evi_encode = bc.encode(list(train_df['evi_text']))
# np.save("./BERT_MLP_encodings/train_evi_encode" + case, train_evi_encode)

test_evi_encode = bc.encode(list(test_df['evi_text']))
np.save("./BERT_MLP_encodings/test_evi_encode" + case, test_evi_encode)



In [10]:
# Encode the "claim ||| evidence" pair text of the train and test sets.
# Restart the server with:
# bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
# NOTE(review): the command above names the uncased model; when `case` is "_cased" the cased model must be served instead — confirm before running.


# Training-set encoding is disabled here — presumably it was already saved by an earlier run; TODO confirm.
# train_pair_encode = bc.encode(list(train_df['claim_with_evi_text']))
# np.save("./BERT_MLP_encodings/train_pair_encode" + case, train_pair_encode)

test_pair_encode = bc.encode(list(test_df['claim_with_evi_text']))
np.save("./BERT_MLP_encodings/test_pair_encode" + case, test_pair_encode)

### Load bert features from file

In [11]:

def _load_encoding(name):
    """Load the saved encoding array ``./BERT_MLP_encodings/<name>.npy``."""
    # A single path template keeps every load consistent; two of the
    # original paths carried a stray "././" prefix.
    return np.load("./BERT_MLP_encodings/" + name + ".npy")


# Encodings saved without a suffix (case = "", i.e. the uncased model).
train_claim_features = _load_encoding("train_claim_encode")
test_claim_features = _load_encoding("test_claim_encode")

train_evi_features = _load_encoding("train_evi_encode")
test_evi_features = _load_encoding("test_evi_encode")

train_pair_features = _load_encoding("train_pair_encode")
test_pair_features = _load_encoding("test_pair_encode")

# Encodings saved with the "_cased" suffix (the cased model).
train_claim_features_cased = _load_encoding("train_claim_encode_cased")
test_claim_features_cased = _load_encoding("test_claim_encode_cased")

train_evi_features_cased = _load_encoding("train_evi_encode_cased")
test_evi_features_cased = _load_encoding("test_evi_encode_cased")

train_pair_features_cased = _load_encoding("train_pair_encode_cased")
test_pair_features_cased = _load_encoding("test_pair_encode_cased")


In [12]:
# Feature matrix: the six encodings (claim, evidence, pair; uncased then
# cased) concatenated column-wise, in the same order for train and test.
x_train = np.concatenate(
    [
        train_claim_features,
        train_evi_features,
        train_pair_features,
        train_claim_features_cased,
        train_evi_features_cased,
        train_pair_features_cased,
    ],
    axis=1,
)
# Select the one-hot label columns by name rather than by position.
# Positional slicing (columns[0:3]) only yields the labels when pandas sorts
# the dict keys alphabetically; versions that keep insertion order would
# return the three text columns instead. The order NOINFO, REF, SUP is the
# one the prediction cell decodes (0 -> NOT ENOUGH INFO, 1 -> REFUTES,
# 2 -> SUPPORTS).
y_train = train_df[["NOINFO", "REF", "SUP"]].values
x_test = np.concatenate(
    [
        test_claim_features,
        test_evi_features,
        test_pair_features,
        test_claim_features_cased,
        test_evi_features_cased,
        test_pair_features_cased,
    ],
    axis=1,
)

In [13]:
# Sanity check: x_train and y_train should have the same number of rows,
# and x_train and x_test the same number of feature columns.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(145449, 6144)
(145449, 3)
(11879, 6144)


## Build and train model

### Simple MLP model prototype


In [14]:
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam

# Seed NumPy's random generator.
# NOTE(review): this does not seed the Keras/TensorFlow backend itself —
# confirm whether fully deterministic weight initialisation is required.
seed = 7
np.random.seed(seed)


# MLP head over the concatenated BERT features: two ReLU hidden layers,
# dropout, and a 3-unit softmax (one unit per label column of y_train).
model = Sequential()
model.add(Dense(units=200, activation='relu', input_dim=x_train.shape[1]))
# Only the first layer of a Sequential model takes the input size; later
# layers infer it from the previous layer's output, so the redundant
# input_dim that used to be passed here (and was ignored, as the 10,050
# parameters in the summary show) has been dropped.
model.add(Dense(units=50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=3, activation='softmax'))
# optimizer = Adam(lr=0.01)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam', metrics=['accuracy'])

model.summary()
# SVG(model_to_dot(model).create(prog='dot', format='svg'))

# Callbacks: keep only the best weights on disk and stop once validation
# accuracy has not improved for 3 epochs.
# NOTE(review): newer Keras releases report this metric as 'val_accuracy'
# rather than 'val_acc' — confirm against the installed version.
filepath="best_weights_head.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
earlyStopping = EarlyStopping(monitor='val_acc', patience=3, verbose=0, mode='auto')

callbacks_list = [checkpoint, earlyStopping]

# Training is disabled here; the prediction cell loads the weights from
# `filepath` instead.
# model.fit(x=x_train, y=y_train, batch_size=32, epochs=50, validation_split=0.1, callbacks=callbacks_list)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               1229000   
_________________________________________________________________
dense_2 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 153       
Total params: 1,239,203
Trainable params: 1,239,203
Non-trainable params: 0
_________________________________________________________________


### Tune hyperparameters manually

In [None]:
# import itertools

# class Cartesian(object):
#     def __init__(self):
#         self._data_list = []
#         self._name_list = []
#         self.cartesian_result = []

#     def add_data(self, data, name): #add list for cartesian product
#         self._data_list.append(data)
#         self._name_list.append(name)

#     def build(self): #calculate cartesian product
#         for item in itertools.product(*self._data_list):
#             result_dict = {}
#             for i in range(len(item)):
#                 result_dict.update({
#                     self._name_list[i]: item[i]
#                 })
#             self.cartesian_result.append(result_dict)
#         return self.cartesian_result


In [None]:
# # from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# # from keras.layers import Bidirectional, GlobalMaxPool1D
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from keras.utils.vis_utils import model_to_dot
# from keras.optimizers import Adam
# from keras.regularizers import l2
# import keras

# # fix random seed for reproducibility
# seed = 7
# np.random.seed(seed)

# def create_model(first_units = 200, second_units = 50, dropout = 0.5, decay = 0.01):
#     model = Sequential()
#     model.add(Dense(units=first_units, activation='relu',
#                     kernel_regularizer=l2(decay),
#                     input_dim=x_train.shape[1]))
#     model.add(Dropout(dropout))
#     model.add(Dense(units=second_units, activation='relu', 
#                     kernel_regularizer=l2(decay),
#                     input_dim=x_train.shape[1]))
#     model.add(Dropout(dropout))
#     model.add(Dense(units=3, activation='softmax'))
# #     optimizer = Adam(lr=0.01)
#     model.compile(loss=keras.losses.categorical_crossentropy,
#                   optimizer='adam', metrics=['accuracy'])
#     return model

# def fit_model(model, batch_size=32):
#     earlyStopping = EarlyStopping(monitor='val_acc', patience=2, 
#                                   verbose=0, mode='auto')
#     callbacks_list = [earlyStopping]

#     model_history = model.fit(x=x_train, y=y_train, 
#                               batch_size=batch_size, epochs=50, 
#                               validation_split=0.15, callbacks=callbacks_list, verbose=0)
#     return model_history

# first_units_list = [100, 250, 300, 500]
# second_units_list = [50, 100, 150, 200]
# dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
# decay_list = [0.00001, 0.0001, 0.001]
    
# car_product=Cartesian()
# car_product.add_data(first_units_list, 'first_units')
# car_product.add_data(second_units_list, 'second_units')
# car_product.add_data(dropout_list, 'dropout')
# car_product.add_data(decay_list, 'decay')
# parameter_combinations = car_product.build()

# historys_list = []
# print("total iteration: " + str(len(parameter_combinations)))
# iternum = 0
# for combination in parameter_combinations:
#     print("itertion: " + str(iternum))
#     print(combination)
#     model = create_model(first_units = combination['first_units'], 
#                          second_units = combination['second_units'], 
#                          dropout = combination['dropout'], 
#                          decay = combination['decay'])
#     model_history = fit_model(model=model, batch_size=32)
    
#     historys_list.append({
#         'combination': combination,
#         'max_val_acc': max(model_history.history['val_acc'])
#     })
#     print("result: " + str(max(model_history.history['val_acc'])))
#     iternum += 1


In [None]:
# # sort and output_to_file
# ordered_history = sorted(historys_list, key= lambda x: x['max_val_acc'], reverse=True)

# historys_list_dict = {
#     "historys": ordered_history
# }
# with open('tune_hps.json', 'w') as hp_result:
#     json.dump(ordered_history, hp_result, indent=4)
    

## Apply Model

In [15]:
# Restore the checkpointed weights and score the test features.
model.load_weights("best_weights_head.hdf5")
y_test = model.predict(x_test, batch_size=128, verbose=1)

# Class index -> output label: 0 is NOT ENOUGH INFO, 1 is REFUTES and
# anything else (i.e. 2) is SUPPORTS.
label_names = ("NOT ENOUGH INFO", "REFUTES", "SUPPORTS")
# One argmax over the whole prediction matrix instead of one call per row.
predicted_classes = np.argmax(y_test, axis=1)

# Walk the test rows and their predicted classes in lockstep; iterating rows
# directly avoids per-cell chained indexing (test_df[col][i]).
for row, class_idx in zip(test_df.itertuples(index=False), predicted_classes):
    result_dict[row.key] = {
        "claim": row.claim,
        "label": label_names[class_idx],
        "evidence": row.evidence
    }

# result_dict also holds the records load_test_data() answered directly, so
# the dump covers every input record.
# NOTE(review): the file is named 'result_on_dev.json' even when the test
# file was selected as input — confirm the intended output name.
with open('result_on_dev.json', 'w') as outfile:
    json.dump(result_dict, outfile, indent=4)

