## Predict Label with extracted evidence texts
This notebook builds the MLP model for RTM step according to the FNC competition paper.  

### Data preprocess

#### Load data as pandas DF

In [1]:
import json
import numpy as np
import pandas as pd

train_file_path = "./JSONFiles/" + "train_with_text.json"
use_test_file = False
if use_test_file:
    test_file_path = './JSONFiles/' + 'test_with_text.json'
else:
    test_file_path = './JSONFiles/' + 'dev_with_text.json'

with open(train_file_path, mode='r') as f:
    train = json.load(f)
with open(test_file_path, mode='r') as f:
    test = json.load(f)

def load_training_data(dataset: dict) -> list:
    dataset_list = []
    for key in dataset.keys():
        record = dataset.get(key)
        claim = record.get("claim")
        evi_texts = record.get("evidence_texts")
        text = ''.join(evi_texts)
        if len(text) == 0:
            text = "no word"

        SUP = NOINFO = REF = 0
        if record.get("label") == "SUPPORTS":
            SUP = 1
        elif record.get("label") == "REFUTES":
            REF = 1
        else:
            NOINFO = 1
        dataset_record = {
            "claim": claim,
            "evi_text": text,
            "claim_with_evi_text": claim + " ||| " + text,
            "SUP": SUP,
            "NOINFO": NOINFO,
            "REF": REF
        }
        dataset_list.append(dataset_record)
    return dataset_list

def load_test_data(dataset: dict) -> list:
    dataset_list = []
    for key in dataset.keys():
        record = dataset.get(key)
        claim = record.get("claim")
        evi_index = record.get("evidence")
        evi_texts = record.get("evidence_texts")
        text = ''.join(evi_texts)
        if len(text) == 0:
            text = "no word"
            
        dataset_record = {
            "key": key,
            "claim": claim,
            "evidence": evi_index,
            "claim_with_evi_text": claim + " ||| " + text,
            "evi_text": text
        }
        dataset_list.append(dataset_record)
    return dataset_list

train_df = pd.DataFrame(load_training_data(train))
test_df = pd.DataFrame(load_test_data(test))

train_df[0: 10]

Unnamed: 0,NOINFO,REF,SUP,claim,claim_with_evi_text,evi_text
0,0,1,0,Ireland does not have relatively low-lying mou...,Ireland does not have relatively low-lying mou...,The island 's geography comprises relatively l...
1,0,0,1,The drama Dark Matter stars Taylor Schilling.,The drama Dark Matter stars Taylor Schilling. ...,She made her film debut in the 2007 drama Dark...
2,0,0,1,"In 1932, Prussia was taken over.","In 1932, Prussia was taken over. ||| In the We...","In the Weimar Republic , the state of Prussia ..."
3,0,0,1,IZombie premiered in 2015.,IZombie premiered in 2015. ||| The series prem...,"The series premiered on March 17 , 2015 .\n"
4,0,0,1,Ronald Reagan had a nationality.,Ronald Reagan had a nationality. ||| Ronald Wi...,"Ronald Wilson Reagan February 6 , 1911 -- June..."
5,0,0,1,Samoa Joe wrestles professionally.,Samoa Joe wrestles professionally. ||| Nuufola...,"Nuufolau Joel `` Joe '' Seanoa born March 17 ,..."
6,0,0,1,University of Oxford is in the universe.,University of Oxford is in the universe. ||| T...,The University of Oxford informally Oxford Uni...
7,1,0,0,The Renaissance began online.,The Renaissance began online. ||| Their earlie...,Their earlier musical output was often labelle...
8,0,0,1,Portia de Rossi appeared on Scandal.,Portia de Rossi appeared on Scandal. ||| She a...,She appeared as a regular cast member on the A...
9,0,1,0,The Berlin Wall was only standing for 10 years.,The Berlin Wall was only standing for 10 years...,The Berlin Wall Berliner Mauer was a guarded c...


In [2]:
test_df[0: 10]

Unnamed: 0,claim,claim_with_evi_text,evi_text,evidence,key
0,Ripon College's student number totaled in at a...,Ripon College's student number totaled in at a...,"As of 2015 , Ripon College 's student body sto...","[[Ripon_College_-LRB-Wisconsin-RRB-, 1]]",100038
1,"Kesha was baptized on March 1st, 1987.","Kesha was baptized on March 1st, 1987. ||| Kes...","Kesha Rose Sebert ; born March 1 , 1987 ; form...","[[Kesha, 0]]",100083
2,Birthday Song (2 Chainz song) was banned by So...,Birthday Song (2 Chainz song) was banned by So...,"The song , which features fellow American rapp...","[[Birthday_Song_-LRB-2_Chainz_song-RRB-, 1]]",100169
3,The University of Illinois at Chicago is a col...,The University of Illinois at Chicago is a col...,The University of Illinois at Chicago or UIC i...,"[[University_of_Illinois_at_Chicago, 0]]",100234
4,French Indochina was officially known as the I...,French Indochina was officially known as the I...,Queen Square is the first element in `` the mo...,"[[Queen_Square,_Bath, 1]]",100359
5,Damon Albarn has refused to ever work with Bri...,Damon Albarn has refused to ever work with Bri...,His debut solo studio album Everyday Robots --...,"[[Damon_Albarn, 17]]",100366
6,Lost (TV series) is a series of plays.,Lost (TV series) is a series of plays. ||| Los...,Lost is an American television drama series th...,"[[Lost_-LRB-TV_series-RRB-, 0]]",100429
7,Edison Machine Works was barely set up to prod...,Edison Machine Works was barely set up to prod...,A metabibliography or biblio-bibliography is a...,"[[Metabibliography, 0]]",100457
8,The human brain is set apart from mammalian br...,The human brain is set apart from mammalian br...,The office was replaced by the Lord Lieutenant...,"[[Lord_Lieutenant_of_Ross-shire, 1]]",100461
9,"There are rumors that Augustus' wife, Livia, p...","There are rumors that Augustus' wife, Livia, p...",Entire SH-09 between Dabok and Chittaurgarh ha...,"[[Mavli, 41]]",100481


#### Tokenization and Lemmatization

In [3]:
import nltk
nltk.download('stopwords')

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process(comment) -> str:
    # lower cased
    comment = comment.lower()
    # tokenize
    words = tokenizer.tokenize(comment)
    # lemmatize 
    words = [lemmatize(w) for w in words]
    # remove stop words
#     stop_words = nltk.corpus.stopwords.words('english')
#     words = [w for w in words if not w in stop_words]
    # return result
    processed_comment = " ".join(words)
    return processed_comment

def process_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    corpus = pd.concat([dataset['claim'], dataset['evi_text']])
    processed_corpus = corpus.apply(lambda text: pre_process(text))
    dataset['claim'] = processed_corpus.iloc[0: len(dataset)]
    dataset['evi_text'] = processed_corpus.iloc[len(dataset):,]
    return dataset

train_df = process_dataset(train_df)
test_df = process_dataset(test_df)
train_df[0: 10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,NOINFO,REF,SUP,claim,claim_with_evi_text,evi_text
0,0,1,0,moana be not a disney movie,Moana is not a Disney movie. ||| Moana -LRB- -...,moana lrb lsb moʊˈɑːnə rsb rrb be a 2016 ameri...
1,0,0,1,sweden be a country,Sweden is a country. ||| Sweden -LRB- Konungar...,sweden lrb konungariket sverige rrb be a scand...
2,1,0,0,jeff hardy be an announcer,Jeff Hardy is an announcer. ||| Ziri Hammar -L...,ziri hammar lrb زيري حم ار bear july 25 1992 i...
3,0,0,1,francois de belleforest translate the work of ...,Francois de Belleforest translated the works o...,no word
4,0,0,1,dominick dunne be involve in the film panic in...,Dominick Dunne was involved in the film Panic ...,he begin his career a a producer in film and t...
5,0,1,0,superior donut be rarely american,Superior Donuts is rarely American. ||| Superi...,superior donut be an american sitcom that air ...
6,1,0,0,the kitti s hog nose bat belong to the chiropt...,The Kitti's hog-nosed bat belongs to the Chiro...,serhiy voronin a ukrainian footballer
7,0,1,0,new horizon only fly past the smallest planet ...,New Horizons only flew past the smallest plane...,the jupiter flyby provide a gravity assist tha...
8,0,0,1,stockard channing star in grease,Stockard Channing starred in Grease. ||| She i...,she be know for play betty rizzo in the film g...
9,0,0,1,the house of lusignan reign during the middle age,The House of Lusignan reigned during the Middl...,the house of lusignan lrb lsb ˈluːzᵻnjɒn rsb r...


### Feature extraction

#### TF Features

In [3]:
train_df['claim']

0         Ireland does not have relatively low-lying mou...
1             The drama Dark Matter stars Taylor Schilling.
2                          In 1932, Prussia was taken over.
3                                IZombie premiered in 2015.
4                          Ronald Reagan had a nationality.
5                        Samoa Joe wrestles professionally.
6                  University of Oxford is in the universe.
7                             The Renaissance began online.
8                      Portia de Rossi appeared on Scandal.
9           The Berlin Wall was only standing for 10 years.
10                               Laurie Hernandez competes.
11        There are zero cities in the Northeast megalop...
12                         There is a film called Zootopia.
13                 Portia de Rossi was featured on Scandal.
14                       Sean Connery was cast in The Rock.
15        San Francisco is the location of the Hudson Br...
16        Dennis Quaid decided to act in

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix, hstack

max_features = 5000 

train_corpus = pd.concat([train_df['claim'], train_df['evi_text']])
test_corpus = pd.concat([test_df['claim'], test_df['evi_text']])

tf_vectorizer = CountVectorizer(max_features=max_features)
tf_vectorizer.fit(train_corpus)
train_claim_tf_features = tf_vectorizer.transform(train_df['claim'])
train_evi_tf_features = tf_vectorizer.transform(train_df['evi_text'])
test_claim_tf_features = tf_vectorizer.transform(test_df['claim'])
test_evi_tf_features = tf_vectorizer.transform(test_df['claim'])

train_tf_features = hstack([train_claim_tf_features, train_evi_tf_features])
test_tf_features = hstack([test_claim_tf_features, test_evi_tf_features])
# claim_tf_vectorizer = CountVectorizer(max_features=max_features)
# claim_tf = claim_tf_vectorizer.fit_transform(train_df['claim'])
# evi_text_tf_vectorizer = CountVectorizer(max_features=max_features)
# evi_text_tf = evi_text_tf_vectorizer.fit_transform(train_df['evi_text'])
# tf_features = hstack([claim_tf, evi_text_tf])

# tf_features

#### TF_IDF Cosine similarity

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

max_features = 5000

all_corpus = pd.concat([train_corpus, test_corpus])

def calculate_cosines(claim_tfidf, evi_tfidf) -> np.ndarray:
    cosines = np.zeros((claim_tfidf.shape[0], 1))
    for i in range(len(cosines)):
        claim_vector = claim_tfidf[i]
        evi_vector = evi_tfidf[i]
        cosine_matrix = cosine_similarity([claim_vector.toarray()[0], evi_vector.toarray()[0]])
        cosines[i][0] = cosine_matrix[0][1]
    return cosines

tfidf_vectorizer = TfidfVectorizer(max_features=max_features, norm='l2')
tfidf_vectorizer.fit(all_corpus)

train_claim_tfidf = tfidf_vectorizer.transform(train_df['claim'])
train_evi_tfidf = tfidf_vectorizer.transform(train_df['evi_text'])
train_cosines = calculate_cosines(train_claim_tfidf, train_evi_tfidf)

test_claim_tfidf = tfidf_vectorizer.transform(test_df['claim'])
test_evi_tfidf = tfidf_vectorizer.transform(test_df['evi_text'])
test_cosines = calculate_cosines(test_claim_tfidf, test_evi_tfidf)



#### Concat features together

In [6]:
x_train = hstack([train_tf_features, train_cosines]).toarray()
y_train = train_df[train_df.columns[0:3]].values
x_test = hstack([test_tf_features, test_cosines]).toarray()

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(145449, 10001)
(145449, 3)
(5001, 10001)


## Build and Train model
Build an MLP with tensor (10001, 1) as input, 1 hidden layer with 100 neurons, and softmax layer for output. 

### Simple MLP model prototype

In [7]:
# from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# from keras.layers import Bidirectional, GlobalMaxPool1D

# {lr=0.01, batch_size=128, dropout=0.5, units=100}
# {lr=0.001, batch_size=256, dropout=0.6, units=100}
seed = 7
np.random.seed(seed)

import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=x_train.shape[1]))
model.add(Dropout(0.6))
model.add(Dense(units=3, activation='softmax'))
optimizer = Adam(lr=0.01)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam', metrics=['accuracy'])

model.summary()
# SVG(model_to_dot(model).create(prog='dot', format='svg'))

# callbacks
filepath="best_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
earlyStopping = EarlyStopping(monitor='val_acc', patience=1, verbose=0, mode='auto')

callbacks_list = [checkpoint, earlyStopping]

model.fit(x=x_train, y=y_train, batch_size=128, epochs=50, validation_split=0.1, callbacks=callbacks_list)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               1000200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 1,000,503
Trainable params: 1,000,503
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 130904 samples, validate on 14545 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.84428, saving model to best_weights.hdf5
Epoch 2/50



<keras.callbacks.History at 0x7fe622d96320>

### Tune hyper-parameters mannually

In [None]:
# import itertools

# class Cartesian(object):
#     def __init__(self):
#         self._data_list = []
#         self._name_list = []
#         self.cartesian_result = []

#     def add_data(self, data, name): #add list for cartesian product
#         self._data_list.append(data)
#         self._name_list.append(name)

#     def build(self): #calculate cartesian product
#         for item in itertools.product(*self._data_list):
#             result_dict = {}
#             for i in range(len(item)):
#                 result_dict.update({
#                     self._name_list[i]: item[i]
#                 })
#             self.cartesian_result.append(result_dict)
#         return self.cartesian_result


In [None]:
# # from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# # from keras.layers import Bidirectional, GlobalMaxPool1D
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from keras.utils.vis_utils import model_to_dot
# from keras.optimizers import Adam
# import keras

# # fix random seed for reproducibility
# seed = 7
# np.random.seed(seed)

# def create_model(units = 100, dropout = 0.5, lr = 0.001):
#     model = Sequential()
#     model.add(Dense(units=units, activation='relu', input_dim=x_train.shape[1]))
#     model.add(Dropout(dropout))
#     model.add(Dense(units=3, activation='softmax'))
#     optimizer = Adam(lr=lr)
#     model.compile(loss=keras.losses.categorical_crossentropy,
#                   optimizer='adam', metrics=['accuracy'])
#     return model

# def fit_model(model, batch_size=32):
#     earlyStopping = EarlyStopping(monitor='val_acc', patience=3, 
#                                   verbose=0, mode='auto')
#     callbacks_list = [earlyStopping]

#     model_history = model.fit(x=x_train, y=y_train, 
#                               batch_size=batch_size, epochs=50, 
#                               validation_split=0.1, callbacks=callbacks_list, verbose=1)
#     return model_history

# units_list = [25, 50, 100, 250, 500]
# dropout_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
# batch_size_list = [16, 32, 64, 128, 256, 512]
# lr_list = [0.0001, 0.001, 0.01, 0.1]
    
# car_product=Cartesian()
# car_product.add_data(units_list, 'units')
# car_product.add_data(dropout_list, 'dropout')
# car_product.add_data(batch_size_list, 'batch_size')
# car_product.add_data(lr_list, 'lr')
# parameter_combinations = car_product.build()

# historys_list = []
# iternum = 0
# for combination in parameter_combinations:
#     print("itertion: " + str(iternum))
#     print(combination)
#     model = create_model(units=combination['units'], 
#                          dropout=combination['dropout'], 
#                          lr=combination['lr'])
#     model_history = fit_model(model=model, batch_size=combination['batch_size'])
#     historys_list.append({
#         'combination': combination,
#         'max_val_acc': max(model_history.history['val_acc'])
#     })
#     print("result: " + str(max(model_history.history['val_acc'])))
#     iternum += 1


In [None]:
# # sort and output_to_file
# ordered_history = sorted(historys_list, key= lambda x: x['max_val_acc'], reverse=True)

# historys_list_dict = {
#     "historys": ordered_history
# }
# with open('tune_hps.json', 'w') as hp_result:
#     json.dump(ordered_history, hp_result, indent=4)
    

### Tune hyper-parameters with sklearn

In [None]:
# from sklearn.model_selection import GridSearchCV
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import train_test_split
# import keras
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot


# # # fix random seed for reproducibility
# seed = 7
# np.random.seed(seed)

# def create_model():
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=x_train.shape[1]))
#     model.add(Dropout(0.3))
#     model.add(Dense(units=3, activation='softmax'))
#     model.compile(loss=keras.losses.categorical_crossentropy,
#                   optimizer='adam', metrics=['accuracy'])
#     return model

# model = KerasClassifier(build_fn=create_model, verbose=2)
# batch_size = [64, 128]
# # epochs = [1, 2]
# param_grid = dict(batch_size=batch_size)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=2)
# grid_result = grid.fit(X=x_train, y=y_train)


# # summarize results
# # print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# # means = grid_result.cv_results_['mean_test_score']
# # stds = grid_result.cv_results_['std_test_score']
# # params = grid_result.cv_results_['params']
# # for mean, stdev, param in zip(means, stds, params):
# #     print("%f (%f) with: %r" % (mean, stdev, param))

# grid_result.best_params_

In [None]:
# def create_model():
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=x_train.shape[1]))
#     model.add(Dropout(0.3))
#     model.add(Dense(units=3, activation='softmax'))
#     model.compile(loss=keras.losses.categorical_crossentropy,
#                   optimizer='adam', metrics=['accuracy'], verbose=2)
#     return model

# def fit_model():
#     # callbacks
#     filepath="best_weights.hdf5"
#     checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#     earlyStopping = EarlyStopping(monitor='val_acc', patience=1, verbose=0, mode='min')

#     callbacks_list = [checkpoint, earlyStopping]

#     model.fit(x=x_train, y=y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=callbacks_list)


## Apply model

### Apply model on test data

In [8]:
model.load_weights("best_weights.hdf5")
y_test = model.predict(x_test, batch_size=128, verbose=1)
y_test



array([[7.7777557e-05, 8.1297863e-01, 1.8694350e-01],
       [1.3373666e-04, 9.9978524e-01, 8.1002356e-05],
       [3.7203248e-05, 1.3802871e-01, 8.6193407e-01],
       ...,
       [1.3517000e-04, 2.7017993e-01, 7.2968489e-01],
       [9.9983287e-01, 9.6332136e-05, 7.0762784e-05],
       [1.9827129e-01, 6.0392892e-01, 1.9779973e-01]], dtype=float32)

### Output result to file

In [9]:
result_dict = {}

for i in range(len(test_df)):
    if np.argmax(y_test[i]) == 0:
        label = "NOT ENOUGH INFO"
    elif np.argmax(y_test[i]) == 1:
        label = "REFUTES"
    else:
        label = "SUPPORTS"
    key = test_df['key'][i]
    result_dict.update({
        key:{
            "claim": test_df['claim'][i],
            "label": label,
            "evidence": test_df['evidence'][i]
        }
    })
    
with open('result_on_dev.json', 'w') as outfile:
    json.dump(result_dict, outfile, indent=4)