In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
sys.path.insert(0, "../input/tokenization/")
import tensorflow_hub as hub
import tensorflow as tf
from tokenization import *
import tensorflow.keras.backend as K
import gc
import os
import random
from numpy.random import seed
from scipy.stats import spearmanr
from math import floor, ceil
seed(42)
tf.random.set_seed(42)
random.seed(42)

os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = "true"

np.set_printoptions(suppress=True)

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tokenization/tokenization.py
/kaggle/input/glbertbaseuncased/google-bert-uncased-base/saved_model.pb
/kaggle/input/glbertbaseuncased/google-bert-uncased-base/assets/vocab.txt
/kaggle/input/glbertbaseuncased/google-bert-uncased-base/variables/variables.data-00000-of-00001
/kaggle/input/glbertbaseuncased/google-bert-uncased-base/variables/variables.index
/kaggle/input/google-quest-challenge/test.csv
/kaggle/input/google-quest-challenge/train.csv
/kaggle/input/google-quest-challenge/sample_submission.csv
/kaggle/input/bertbaseuncased/bert-base-uncased/config.json
/kaggle/input/bertbaseuncased/bert-base-uncased/vocab.txt
/kaggle/input/bertbaseuncased/bert-base-uncased/pytorch_model.bin


In [3]:
data_dir = '../input/google-quest-challenge/'
df_train = pd.read_csv("../input/google-quest-challenge/train.csv")
df_test = pd.read_csv("../input/google-quest-challenge/test.csv")

In [4]:
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/glbertbaseuncased/google-bert-uncased-base'
tokenizer = FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

In [5]:
output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])

In [6]:
#df_train['all_concat'] = df_train['question_title'] + df_train['question_body'] + df_train['answer']
#df_test['all_concat'] = df_test['question_title'] + df_test['question_body'] + df_test['answer']

In [7]:
def _get_masks(tokens, max_seq_length):
    
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):

    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _trim_input(sentence1, max_sequence_length, sentence2 = None):

    if sentence2 is not None:
        q = tokenizer.tokenize(sentence1)
        a = tokenizer.tokenize(sentence2)
        q_len = len(q)
        a_len = len(a)
        
        if (q_len+a_len+3) > max_sequence_length:
        
            new_q_len = q_len/(a_len+q_len) * (max_sequence_length-3)
            new_a_len = a_len/(q_len+a_len) * (max_sequence_length-3)
            new_q_len, new_a_len = int(ceil(new_q_len)), int(floor(new_a_len))

            if new_a_len+new_q_len+3 != max_sequence_length:
                raise ValueError("too small %s" % str(new_a_len+new_q_len+3))

            q = q[:new_q_len]
            a = a[:new_a_len]
        
        return q,a
    
    else:
        q = tokenizer.tokenize(sentence1)
        q_len = len(q)
        
        if (q_len+3) > max_sequence_length:
        
            new_q_len = q_len/(q_len) * (max_sequence_length-3)
            new_q_len = int(ceil(new_q_len))

            if new_q_len+3 != max_sequence_length:
                raise ValueError("too small %s" % str(new_q_len+3))

            q = q[:new_q_len]   
        
        return q

def _convert_to_bert_inputs(sentence1, tokenizer, max_sequence_length, sentence2 = None):
    
    if sentence2 is None:
        stoken = ["[CLS]"] + sentence1 + ["[SEP]"]
    else:
        stoken = ["[CLS]"] + sentence1 + ["[SEP]"] + sentence2 + ["[SEP]"]
        

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        
        if len(columns) > 1:
            q, a = instance[columns[0]], instance[columns[1]]

            q, a = _trim_input(q, max_sequence_length, a)

            ids, masks, segments = _convert_to_bert_inputs(q, tokenizer, max_sequence_length, a)

        else:
            q = instance[columns[0]]

            q = _trim_input(q, max_sequence_length)

            ids, masks, segments = _convert_to_bert_inputs(q, tokenizer, max_sequence_length)
            
        
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

In [8]:
def bert_model():
    
    input_word_ids = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    pooled_output, _ = bert_layer([input_word_ids, input_masks, input_segments])
    
    #x = tf.keras.layers.Dropout(0.3)(pooled_output)
    x = tf.keras.layers.Dense(512, activation='elu', name='dense_penultimate')(pooled_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(21, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model

In [9]:
def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(spearmanr(col_trues, col_pred).correlation)
    return np.mean(rhos)

class CustomCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, valid_data, fold=None):
        
        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        
        self.batch_size = batch_size
        self.fold = fold
        
    def on_epoch_end(self, epoch, logs={}):
        
        self.valid_predictions = self.model.predict(self.valid_inputs)
        
        rho_val = compute_spearmanr(self.valid_outputs, self.valid_predictions)
        
        print("\nvalidation rho: %.4f" % rho_val)
        
        #if self.fold is not None:
        #    self.model.save_weights(f'bert-base-{fold}-{epoch}.h5py')

In [10]:
def train_and_predict(model, train_data, valid_data, test_data, learning_rate, epochs, batch_size, loss_function):
    
    custom_callback = CustomCallback(valid_data=(valid_data[0], valid_data[1]), fold=None)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs, batch_size=batch_size, verbose = 1
        ,validation_data=valid_data
        ,callbacks = [custom_callback])
    
    preds = model.predict(test_data)
    
    return preds

In [11]:
# def train_and_predict(model, train_data, valid_data, test_data, 
#                       learning_rate, epochs, batch_size, loss_function, fold):
        
#     custom_callback = CustomCallback(
#         valid_data=(valid_data[0], valid_data[1]), 
#         test_data=test_data,
#         batch_size=batch_size,
#         fold=None)

#     optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     model.compile(loss=loss_function, optimizer=optimizer)
#     model.fit(train_data[0], train_data[1], epochs=epochs, batch_size=batch_size, callbacks=[custom_callback])
    
#     return custom_callback

#### For Single-Fold Run

In [12]:
question_categories = [x for x in output_categories if 'question' in x]
answer_categories = [x for x in output_categories if 'answer' in x][2:]

In [13]:
# ['question_title', 'question_body', 'answer']
output_categories = question_categories
input_categories = ['question_title', 'question_body']
MAX_SEQUENCE_LENGTH = 512
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

from sklearn.model_selection import train_test_split
idx = [x for x in range(6079)]
train_idx, valid_idx = train_test_split(idx, test_size=0.01, random_state=42)

train_inputs = [inputs[i][train_idx] for i in range(3)]
train_outputs = outputs[train_idx]

valid_inputs = [inputs[i][valid_idx] for i in range(3)]
valid_outputs = outputs[valid_idx]

train_data=(train_inputs, train_outputs) 
valid_data=(valid_inputs, valid_outputs)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
learning_rate=1e-4
epochs=3
batch_size=16
loss_function='binary_crossentropy'

In [15]:
model = bert_model()
preds_question = train_and_predict(model, train_data, valid_data, test_inputs, learning_rate, epochs, batch_size, loss_function)

Train on 6018 samples, validate on 61 samples
Epoch 1/3
validation rho: nan


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Epoch 2/3
validation rho: nan
Epoch 3/3
validation rho: nan


In [16]:
import gc
gc.collect()

1741

In [17]:
tf.keras.backend.clear_session()

In [18]:
def bert_model2():
    
    input_word_ids = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    pooled_output, _ = bert_layer([input_word_ids, input_masks, input_segments])
    
    #x = tf.keras.layers.Dropout(0.3)(pooled_output)
    x = tf.keras.layers.Dense(256, activation='elu', name='dense_penultimate')(pooled_output)
    x = tf.keras.layers.Dropout(0.3)(x)
    out = tf.keras.layers.Dense(9, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model

In [19]:
learning_rate=3e-5
epochs=3
batch_size=16
loss_function='binary_crossentropy'

In [20]:
# ['question_title', 'question_body', 'answer']
output_categories = answer_categories
input_categories = ['question_title','answer']
MAX_SEQUENCE_LENGTH = 512
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

from sklearn.model_selection import train_test_split
idx = [x for x in range(6079)]
train_idx, valid_idx = train_test_split(idx, test_size=0.01, random_state=42)

train_inputs = [inputs[i][train_idx] for i in range(3)]
train_outputs = outputs[train_idx]

valid_inputs = [inputs[i][valid_idx] for i in range(3)]
valid_outputs = outputs[valid_idx]

train_data=(train_inputs, train_outputs) 
valid_data=(valid_inputs, valid_outputs)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [21]:
del model

In [22]:
model = bert_model2()
preds_answer = train_and_predict(model, train_data, valid_data, test_inputs, learning_rate, epochs, batch_size, loss_function)

Train on 6018 samples, validate on 61 samples
Epoch 1/3
validation rho: 0.2740
Epoch 2/3
validation rho: 0.3010
Epoch 3/3
validation rho: 0.3017


In [23]:
test_preds = np.concatenate((preds_question, preds_answer), axis = 1)

In [24]:
targets = list(df_train.columns[11:])

In [25]:
submission = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')
submission[targets] = test_preds
submission.to_csv("submission.csv", index = False)
submission.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.924961,0.666626,0.244689,0.533134,0.582409,0.471319,0.670844,0.621481,0.506177,...,0.910106,0.961879,0.566135,0.969402,0.974803,0.868361,0.01371,0.014564,0.874833,0.925481
1,46,0.87673,0.479127,0.009617,0.744859,0.820798,0.930556,0.582065,0.475116,0.301757,...,0.696496,0.932846,0.652377,0.952638,0.968054,0.836203,0.923212,0.168427,0.036445,0.858581
2,70,0.904051,0.701361,0.066174,0.590609,0.83779,0.737779,0.629417,0.534657,0.23919,...,0.898858,0.932285,0.530423,0.961226,0.950417,0.772442,0.012336,0.019747,0.961413,0.884236
3,132,0.887346,0.56027,0.011126,0.764281,0.800431,0.916522,0.534746,0.454366,0.067256,...,0.743192,0.95804,0.683559,0.968864,0.983751,0.883219,0.69605,0.132747,0.904441,0.914794
4,200,0.918163,0.502562,0.018738,0.835851,0.832783,0.901249,0.612648,0.578329,0.058014,...,0.709314,0.90314,0.653497,0.952898,0.962052,0.807547,0.341162,0.106553,0.54913,0.891941


#### For K-Fold Run

In [26]:
#gkf = GroupKFold(n_splits=4).split(X=df_train.question_body, groups=df_train.question_body)

In [27]:
# histories = []
# for fold, (train_idx, valid_idx) in enumerate(gkf):
    
#     K.clear_session()
#     model = bert_model()
    
#     train_inputs = [inputs[i][train_idx] for i in range(3)]
#     train_outputs = outputs[train_idx]
    
#     valid_inputs = [inputs[i][valid_idx] for i in range(3)]
#     valid_outputs = outputs[valid_idx]
    
#     # history contains two lists of valid and test preds respectively:
#     #  [valid_predictions_{fold}, test_predictions_{fold}]
#     history = train_and_predict(model, 
#                                   train_data=(train_inputs, train_outputs), 
#                                   valid_data=(valid_inputs, valid_outputs),
#                                   test_data=test_inputs, 
#                                   learning_rate=3e-5, epochs=4, batch_size=10,
#                                   loss_function='binary_crossentropy', fold=fold)
    
#     histories.append(history)