In [9]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords 
import re 
import string 
import tensorflow as tf 
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import numpy as np
import gc
from sklearn.model_selection import train_test_split

In [24]:
path = 'C:/Users/DELL/Downloads/contradictory-my-dear-watson'
train_data = pd.read_csv(path + '/train.csv')
test_data = pd.read_csv(path + '/test.csv')

In [25]:
train_data.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [78]:
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 25

In [47]:
def train_word2vec(documents, embedding_dim):
    """
    train word2vector over traning documents
    Args:
        documents (list): list of document
        embedding_dim (int): outpu wordvector size
    Returns:
        word_vectors(dict): dict containing words and their respective vectors
    """
    model = Word2Vec(documents, min_count=1, vector_size=embedding_dim)
    word_vectors = model.wv
    del model
    return word_vectors




In [48]:
def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create embedding matrix containing word indexes and respective vectors from word vectors
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (dict): dict containing word and their respective vectors
        embedding_dim (int): dimention of word vector
    Returns:
    """
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix




In [49]:
def word_embed_meta_data(documents, embedding_dim):
    """
    Load tokenizer object for given vocabs list
    Args:
        documents (list): list of document
        embedding_dim (int): embedding dimension
    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (dict): dict with word_index and vector mapping
    """
    documents = [x.lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix

In [50]:
premises = list(train_data['premise'])
hypothesises = list(train_data['hypothesis'])


In [51]:
tokenizer , embedding_matrix = word_embed_meta_data(premises  + hypothesises  , EMBEDDING_DIM) 

Embedding matrix shape: (64542, 300)
Null word embeddings: 1


In [160]:
def process_train_val_data(data , tokenizer ,max_sequence_length)  :
    labels = data.label 
    labels = np.array(labels)
    premises = data['premise']
    hypothesises = data['hypothesis']
    train_premises = tokenizer.texts_to_sequences(premises)
    train_hypothesises = tokenizer.texts_to_sequences(hypothesises)
    train_premises = pad_sequences(train_premises , maxlen = MAX_SEQUENCE_LENGTH)
    train_hypothesises = pad_sequences(train_hypothesises , maxlen = MAX_SEQUENCE_LENGTH)
    X_P_train , X_P_val , train_labels , val_labels = train_test_split(train_premises , labels  ,test_size= .15 ,random_state =123 )
    X_H_train , X_H_val , _ , _ = train_test_split(train_hypothesises , labels  ,test_size = .15 ,random_state =123)
    return X_P_train , X_P_val ,X_H_train , X_H_val ,train_labels , val_labels 


In [161]:
def process_test_data(data,tokenizer ,max_sequence_length):  
    premises = data['premise']
    hypothesises = data['hypothesis']
    test_premises = tokenizer.texts_to_sequences(premises)
    test_hypothesises = tokenizer.texts_to_sequences(hypothesises)
    X_P_test = pad_sequences(test_premises , maxlen = MAX_SEQUENCE_LENGTH)
    X_H_test = pad_sequences(test_hypothesises , maxlen = MAX_SEQUENCE_LENGTH)

    return X_P_test , X_H_test ,  


In [85]:
from tensorflow.keras.layers import Dense, Input, LSTM, Dropout, Bidirectional ,BatchNormalization , concatenate , Embedding

In [86]:
nb_words = len(tokenizer.word_index) + 1 

In [172]:
class siamese_model(tf.keras.Model) : 
    def __init__(self , embedding_dim, max_sequence_length,nb_words) : 
        super(siamese_model , self ).__init__() 
        # this the unified part of the network 
        self.embed = Embedding(nb_words , embedding_dim , weights=[embedding_matrix],
                                    input_length=max_sequence_length, trainable=False)
        self.lstm_1 = Bidirectional(LSTM(64 , return_sequences = True))
        self.lstm_dropout = Dropout(.5)
        self.lstm_2 = Bidirectional(LSTM(64))
        

        # rest of the model 
        self.dense_1 = Dense(256 , activation ='relu')
        self.batch_norm = BatchNormalization()
        self.drop_1 = Dropout(.5) 
        self.dense_2 = Dense(128 , activation ='relu')
        self.dense_3 = Dense(64 , activation ='relu')
        self.drop_2 = Dropout(.5)
        self.out = Dense(3 , activation = 'softmax')

    def call(self , inputs) : 
        premises , hypothesises = inputs[0] , inputs[1] 
        # premises part 
        X_P = self.embed(premises)
        X_P = self.lstm_1(X_P)
        X_P = self.lstm_dropout(X_P)
        X_P = self.lstm_2(X_P)
        
        # hypothesises part 
        X_H = self.embed(hypothesises)
        X_H = self.lstm_1(X_H)
        X_H = self.lstm_dropout(X_H)
        X_H = self.lstm_2(X_H)
        
        
        # concat 
        X_C = concatenate([X_P , X_H  ])
        X_C = self.dense_1(X_C)
        X_C = self.batch_norm(X_C)
        X_C = self.drop_1(X_C)
        X_C = self.dense_2(X_C)
        X_C = self.dense_3(X_C)
        X_C = self.drop_2(X_C)
        return  self.out(X_C)
        
        

In [165]:
X_P_train , X_P_val ,X_H_train , X_H_val ,train_labels , val_labels  =process_train_val_data(train_data,tokenizer ,MAX_SEQUENCE_LENGTH) 

In [166]:
 X_P_test , X_H_test  = process_test_data(test_data ,tokenizer ,MAX_SEQUENCE_LENGTH)

In [173]:
model = siamese_model(EMBEDDING_DIM ,MAX_SEQUENCE_LENGTH , nb_words ) 

In [174]:
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch : 1e-8 * 10**(epoch / 2)) 
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [175]:
optimizer = tf.keras.optimizers.Adam(lr = 1e-5)

  super(Adam, self).__init__(name, **kwargs)


In [176]:
model.compile(optimizer =optimizer , loss ='sparse_categorical_crossentropy' , metrics=['accuracy'])

In [177]:
Hist = model.fit([X_P_train , X_H_train  ] , train_labels , validation_data =([X_P_val , X_H_val ] , val_labels), 
                epochs =100 , callbacks =[lr_schedule , early_stop] , batch_size =64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
