In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from sklearn.metrics import f1_score

In [2]:
train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')
val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')
test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_preprocessed,question2_preprocessed
0,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,how do i play pok mon go in korea ?,how do i play pok mon go in china ?
1,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...
2,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0,which is more advisable and better material fo...,what is the best server setup for buddypress ?
3,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1,how do i improve logical programming skills ?,how can i improve my logical skills for progra...
4,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1,how close we are to see 3rd world war ?,how close is a world war iii ?


In [4]:
def buildVocabulary(reviews):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')
    tokenizer.fit_on_texts(reviews)
    return tokenizer

def getSequences(reviews, tokenizer, seq_maxlen):
    reviews_seq = tokenizer.texts_to_sequences(reviews)
    return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))

def loadGloveWordEmbeddings():
    embedding_vectors = {}
    with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:
        for line in tqdm(f):
            values = line.split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_vectors[word] = coefs
    return embedding_vectors

def getEmbeddingWeightMatrix(embedding_vectors, word2idx):    
    embedding_matrix = np.zeros((len(word2idx)+1, 300))
    for word, i in tqdm(word2idx.items()):
        embedding_vector = embedding_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [5]:
tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

x_train1 = getSequences(train['question1'], tokenizer, 128)
x_train2 = getSequences(train['question2'], tokenizer, 128)
y_train = tf.keras.utils.to_categorical(train['is_duplicate'])

x_val1 = getSequences(val['question1'], tokenizer, 128)
x_val2 = getSequences(val['question2'], tokenizer, 128)
y_val = tf.keras.utils.to_categorical(val['is_duplicate'])

x_test1 = getSequences(test['question1'], tokenizer, 128)
x_test2 = getSequences(test['question2'], tokenizer, 128)
y_test = tf.keras.utils.to_categorical(test['is_duplicate'])

119558


In [6]:
embedding_vectors = loadGloveWordEmbeddings()
print(len(embedding_vectors))

embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)
print(embedding_weight_matrix.shape)

2196018it [04:42, 7761.81it/s]


2196017


100%|██████████| 119557/119557 [00:00<00:00, 290507.36it/s]

(119558, 300)





In [7]:
MAX_SEQUENCE_LENGTH = 128
WORD_EMBEDDING_DIM = 300
SENT_EMBEDDING_DIM = 128
DROPOUT = 0.1

In [8]:
question1 = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = tf.keras.layers.Embedding(119558, 
                 WORD_EMBEDDING_DIM, 
                 weights=[embedding_weight_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question1)
q1, h1, c1 = tf.keras.layers.LSTM(200, return_sequences=True, return_state = True)(q1)
#q1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(SENT_EMBEDDING_DIM, return_sequences=True), merge_mode="sum")(q1)

q2 = tf.keras.layers.Embedding(119558, 
                 WORD_EMBEDDING_DIM, 
                 weights=[embedding_weight_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question2)
q2, h2, c2 = tf.keras.layers.LSTM(200, return_sequences=True, return_state = True)(q2)
#q2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(SENT_EMBEDDING_DIM, return_sequences=True), merge_mode="sum")(q2)

attention = tf.keras.layers.Dot(axes = (2, 2))([q1,q2])
#attention = tf.keras.layers.Flatten()(attention)
attention_u = tf.keras.layers.Softmax(axis = -1)(attention)
attention_v = tf.keras.layers.Softmax(axis = 1)(attention)
#attention = tf.keras.layers.Dense(MAX_SEQUENCE_LENGTH*SENT_EMBEDDING_DIM, activation = "softmax")(attention)
#attention = tf.keras.layers.Dense((MAX_SEQUENCE_LENGTH*SENT_EMBEDDING_DIM))(attention)
#attention = tf.keras.layers.Reshape((MAX_SEQUENCE_LENGTH, SENT_EMBEDDING_DIM))(attention)

print(h1.shape, q1.shape, c1.shape, attention.shape)
u_bar = tf.keras.layers.Dot(axes = (2, 1))([attention_u, q2])
#u_bar = tf.keras.layers.Flatten()(u_bar)
v_bar = tf.keras.layers.Dot(axes = (2, 1))([attention_v, q1])
print(u_bar.shape, v_bar.shape)
#v_bar = tf.keras.layers.Flatten()(v_bar)
wu = tf.keras.layers.Dense(SENT_EMBEDDING_DIM)(u_bar[:, -1, :])
wv = tf.keras.layers.Dense(SENT_EMBEDDING_DIM)(v_bar[:, -1, :])
vu = tf.keras.layers.Dense(SENT_EMBEDDING_DIM)(q1[:, -1, :])
vv = tf.keras.layers.Dense(SENT_EMBEDDING_DIM)(q2[:, -1, :])
u_ = tf.keras.layers.Add()([wu, vu])
v_ = tf.keras.layers.Add()([wv, vv])
print(u_.shape, v_.shape)
u_star = tf.keras.layers.Dense(SENT_EMBEDDING_DIM, activation = "tanh")(u_)
v_star = tf.keras.layers.Dense(SENT_EMBEDDING_DIM, activation = "tanh")(v_)
print(u_star.shape, v_star.shape)
merged = tf.keras.layers.Concatenate()([u_star, v_star])
merged = tf.keras.layers.Dense(200, activation='relu')(merged)
merged = tf.keras.layers.Dropout(DROPOUT)(merged)
#merged = tf.keras.layers.BatchNormalization()(merged)

is_duplicate = tf.keras.layers.Dense(2, activation='softmax')(merged)

model = tf.keras.Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2021-11-29 05:49:31.554339: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 05:49:31.666119: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 05:49:31.666880: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-29 05:49:31.668042: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

(None, 200) (None, 128, 200) (None, 200) (None, 128, 128)
(None, 128, 200) (None, 128, 200)
(None, 128) (None, 128)
(None, 128) (None, 128)


Rough:

merged = tf.keras.layers.Dense(200, activation='relu')(merged)
merged = tf.keras.layers.Dropout(DROPOUT)(merged)
merged = tf.keras.layers.BatchNormalization()(merged)
merged = tf.keras.layers.Dense(200, activation='relu')(merged)
merged = tf.keras.layers.Dropout(DROPOUT)(merged)
merged = tf.keras.layers.BatchNormalization()(merged)
merged = tf.keras.layers.Dense(200, activation='relu')(merged)
merged = tf.keras.layers.Dropout(DROPOUT)(merged)
merged = tf.keras.layers.BatchNormalization()(merged)

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 128, 300)     35867400    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 128, 300)     35867400    input_2[0][0]                    
______________________________________________________________________________________________

In [10]:
checkpoint_filepath  = 'weights.best.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                                verbose = 1, 
                                                                monitor = 'val_loss',
                                                                save_best_only = True)
history = model.fit((x_train1, x_train2), y_train,
                    batch_size = 8,
                    validation_data = ((x_val1, x_val2), y_val),
                    validation_batch_size = 4,
                    epochs=10,
                    callbacks=[model_checkpoint_callback], 
                    verbose=1)

2021-11-29 05:49:39.406732: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.
2021-11-29 05:49:39.510896: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.
2021-11-29 05:49:39.683086: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2021-11-29 05:49:43.585746: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005




2021-11-29 06:00:28.723292: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.



Epoch 00001: val_loss improved from inf to 0.49630, saving model to weights.best.hdf5




Epoch 2/10

Epoch 00002: val_loss improved from 0.49630 to 0.46914, saving model to weights.best.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.46914
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.46914
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.46914
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.46914
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.46914
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.46914
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.46914
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.46914


In [13]:
loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)

print('loss on test data is', loss)
print('accuracy on test data is', accuracy)

loss on test data is 0.4953326880931854
accuracy on test data is 0.7771649360656738


In [14]:
pred = model.predict((x_test1, x_test2))
print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))

f1_score on test dataset is 0.687913534485745
