In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from sklearn.metrics import f1_score

In [2]:
train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')
val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')
test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_preprocessed,question2_preprocessed
0,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,how do i play pok mon go in korea ?,how do i play pok mon go in china ?
1,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...
2,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,which is more advisable and better material fo...,what is the best server setup for buddypress ?
3,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,how do i improve logical programming skills ?,how can i improve my logical skills for progra...
4,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,how close we are to see 3rd world war ?,how close is a world war iii ?


In [4]:
def buildVocabulary(reviews):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')
    tokenizer.fit_on_texts(reviews)
    return tokenizer

def getSequences(reviews, tokenizer, seq_maxlen):
    reviews_seq = tokenizer.texts_to_sequences(reviews)
    return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))

def loadGloveWordEmbeddings():
    embedding_vectors = {}
    with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:
        for line in tqdm(f):
            values = line.split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_vectors[word] = coefs
    return embedding_vectors

def getEmbeddingWeightMatrix(embedding_vectors, word2idx):    
    embedding_matrix = np.zeros((len(word2idx)+1, 300))
    for word, i in tqdm(word2idx.items()):
        embedding_vector = embedding_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [5]:
tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

x_train1 = getSequences(train['question1'], tokenizer, 128)
x_train2 = getSequences(train['question2'], tokenizer, 128)
y_train = tf.keras.utils.to_categorical(train['is_duplicate'])

x_val1 = getSequences(val['question1'], tokenizer, 128)
x_val2 = getSequences(val['question2'], tokenizer, 128)
y_val = tf.keras.utils.to_categorical(val['is_duplicate'])

x_test1 = getSequences(test['question1'], tokenizer, 128)
x_test2 = getSequences(test['question2'], tokenizer, 128)
y_test = tf.keras.utils.to_categorical(test['is_duplicate'])

119558


In [6]:
embedding_vectors = loadGloveWordEmbeddings()
print(len(embedding_vectors))

embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)
print(embedding_weight_matrix.shape)

2196018it [04:13, 8673.48it/s]


2196017


100%|██████████| 119557/119557 [00:00<00:00, 289620.78it/s]

(119558, 300)





In [8]:
inp1 = tf.keras.Input(shape=(x_train1.shape[1],))
inp2 = tf.keras.Input(shape=(x_train2.shape[1],))

inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, 
                                  weights=[embedding_weight_matrix], trainable=False)(inp1)
inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,
                                  weights=[embedding_weight_matrix], trainable=False)(inp2)

inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)

out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', dropout=0.1, return_sequences=True))(inner)

out = tf.keras.backend.mean(out, axis=1, keepdims=False)

output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)

model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)

2021-12-05 07:28:19.052944: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.
2021-12-05 07:28:19.259332: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.


In [9]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 128, 300)     35867400    input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 128, 300)     35867400    input_4[0][0]                    
______________________________________________________________________________________________

In [15]:
checkpoint_filepath  = 'weights.best.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                                verbose = 1, 
                                                                monitor = 'val_loss',
                                                                save_best_only = True)

history = model.fit((x_train1, x_train2), y_train,
                    batch_size = 32,
                    validation_data = ((x_val1, x_val2), y_val),
                    validation_batch_size = 16,
                    epochs=5,
                    callbacks=[model_checkpoint_callback], 
                    verbose=1)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.58903, saving model to weights.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.58903 to 0.57661, saving model to weights.best.hdf5


In [18]:
loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)

print('loss on test data is', loss)
print('accuracy on test data is', accuracy)

loss on test data is 0.5730125308036804
accuracy on test data is 0.7340275645256042


In [19]:
pred = model.predict((x_test1, x_test2))

print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))

f1_score on test dataset is 0.6310516383599245
