### <font color='red'>Quora Question Pairs - Combined LSTM Model</font>

### Import required libraries

In [2]:
import pandas as pd
import itertools as itertools
import sklearn as skl
import numpy as np
import matplotlib.pyplot as plt
import nltk as nk

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, confusion_matrix

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, Merge, Dense, Dropout, concatenate
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

from nltk.corpus import stopwords

import re
import Levenshtein as leven
from gensim.models import KeyedVectors
from math import sqrt

ModuleNotFoundError: No module named 'Levenshtein'

### Import datasets and clean data, for practice.
The training dataset provided will be split into train-test to validate model's accuracy

In [3]:
## edit the link to be shorter and accessible from root folder
train = pd.read_csv(r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\train.csv')
test = pd.read_csv(r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\test.csv')
embedding_file = r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\GoogleNews-vectors-negative300.bin.gz'
model_dir = r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\Model'

## Exploratory Data Analysis (EDA)

In [4]:
train.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,404290.0,404290.0,404290.0,404290.0
mean,202144.5,217243.942418,220955.655337,0.369198
std,116708.614502,157751.700002,159903.182629,0.482588
min,0.0,1.0,2.0,0.0
25%,101072.25,74437.5,74727.0,0.0
50%,202144.5,192182.0,197052.0,0.0
75%,303216.75,346573.5,354692.5,1.0
max,404289.0,537932.0,537933.0,1.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


#### There are 2 null values under column question2, hence we will have to remove these 2 pairs of questions with null values.

In [7]:
train[train.isnull().any(axis=1)]
train = train.dropna(axis=0, how='any')

# Verify that rows with null values have been removed
train.info()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [6]:
# Count number of words in a question
def words(question):
    return len(question.split())

# Average length of a word in a question
def avg_word_length(question):
    total_words_len = 0
    no_of_words = 0
    question = question.split()
    for word in question:
        total_words_len += len(word)
        no_of_words += 1
    return total_words_len/no_of_words

# Number of characters in a question
def char_count(question):
    return len(question)

# Caps count of question, only takes into account whether the first character of each word is in uppercase
def caps_count(question):
    question = question.split()
    count = 0
    for word in question:
        if word[0].isupper():
            count += 1
    
    return count

# Jaccard Similiarity Coefficient
# Obtain the Jaccard Similiarity Coeefficient between 2 questions
# (X intersect Y) / (X union Y)
def jaccard_coeff(dataframe):
    question1 = dataframe['question1']
    question2 = dataframe['question2']
    question1 = question1.split(' ')
    question2 = question2.split(' ')
    shared_words = 0
    total_words = len(question1) + len(question2)
    
    for word1 in question1:
        for word2 in question2:
            if word1 == word2:
                shared_words += 1
    
    if (total_words-shared_words) == 0:
        return 1
    else:
        return shared_words/(total_words-shared_words)
    
# Levenshtein distance
# Obtain the Levenshtein distance between 2 questions
def levenshtein(dataframe):
    return leven.distance(dataframe['question1'], dataframe['question2'])

str

In [8]:
# Slicing imported dataframe into question1 series, question2 series and questions dataframe
q1 = train.iloc[:,3]
q2 = train.iloc[:,4]
q = train.iloc[:,3:5]
dup = train.iloc[:,5]

# Creating new features using feature engineering functions
word_len_diff = abs(q1.apply(words) - q2.apply(words))
avg_word_len_diff = abs(q1.apply(avg_word_length) - q2.apply(avg_word_length))
char_diff = abs(q1.apply(char_count) - q2.apply(char_count))
caps_diff = abs(q1.apply(caps_count) - q2.apply(caps_count))
jaccard = q.apply(jaccard_coeff, axis=1)
leven_dist = q.apply(levenshtein, axis=1)

# Creating a new dataframe with values of new features
classic_feat = pd.DataFrame({'word_len_diff': word_len_diff, 'avg_word_len_diff': avg_word_len_diff, 
                             'char_diff': char_diff, 'caps_diff': caps_diff, 'jaccard': jaccard, 
                             'leven_dist': leven_dist, 'duplicate': dup})
classic_feat = classic_feat[['word_len_diff', 'avg_word_len_diff', 'char_diff', 'caps_diff', 'jaccard', 'leven_dist', 'duplicate']]

# Create train = true/false boolean column for train-test split
classic_feat['is_train'] = np.random.uniform(0, 1, len(classic_feat)) <= .75

# Train-test dataframes split
train, test = classic_feat[classic_feat['is_train']==True], classic_feat[classic_feat['is_train']==False]

# Number of examplples for training and test dataframes
print('# of examples in the training data:', len(train))
print('# of examples in the test data:',len(test))

In [9]:
# Obtaining index of feature columns
features = classic_feat.columns[:6]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 21.6+ MB


In [None]:
# Obtaining y from the training data
y = train['duplicate']

## Data Munging
Derived from Elior Cohen's data cleaning process

In [10]:
def text2word(text):
    text = str(text).lower()

    # Text cleaning
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = text.split()
    return text


# Prepare embedding
vocabulary = dict()
inverse_vocabulary = ['<unk>']  # placeholder for the [0, 0, ....0] embedding / padding
word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
stops = set(stopwords.words("english"))

questions_cols = ['question1', 'question2']

# Iterate over the questions only of both training and test datasets
for dataset in [train, test]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for question in questions_cols:
            q2n = []  # q2n -> question numbers representation
            
            for word in text2word(row[question]):

                # Check for unwanted words
                if word in stops and word not in word2vec.vocab:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                    
                else:
                    q2n.append(vocabulary[word])

            # Replace questions as word to question as number representation
            dataset.set_value(index, question, q2n)
            
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in word2vec.vocab:
        embeddings[index] = word2vec.word_vec(word)

del word2vec

### Train-Validation data split for x and y values and zero padding of values to ensure consistency in shape of data

In [11]:
# To obtain the max length of the longest question in train/test datasets
max_seq_length = max(train.question1.map(lambda x: len(x)).max(),
                     train.question2.map(lambda x: len(x)).max(),
                     test.question1.map(lambda x: len(x)).max(),
                     test.question2.map(lambda x: len(x)).max())

# Split to train validation (80-20 split)
validation_size = 0.2
training_size = len(train)*(1-validation_size)

# Breaking dataframe values into x (question strings) and y (is_duplicate = 1/0)
x = train[questions_cols]
y = train['is_duplicate']

# Split data using train_test_split
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=validation_size)

# Split to dictionaries for training, validation and test data
x_train = {'left': x_train.question1, 'right': x_train.question2}
x_validation = {'left': x_validation.question1, 'right': x_validation.question2}
x_test = {'left': test.question1, 'right': test.question2}

# Convert y-values (labels) to their numpy representations
y_train = y_train.values
y_validation = y_validation.values

# Zero padding
for dataset, side in itertools.product([x_train, x_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Checking consistent shapes for data using assert
assert x_train['left'].shape == x_train['right'].shape
assert len(x_train['left']) == len(y_train)

### Model building and training

In [12]:
# Model hyperparameters
hidden_layer_nodes = 50
batch_size = 64
epochs = 1

# declare left and right inputs
left_input = Input(shape=(max_seq_length, ), dtype='int32')
right_input = Input(shape=(max_seq_length, ), dtype='int32')

# Create an embedding layer to convert words to their embeddings
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Convert inputs into word embeddings
embedded_left = embedding_layer(left_input)
embedded_right = embedding_layer(right_input)

# lstm layer that will return an output the size of the number of hidden layer nodes
shared_lstm = LSTM(hidden_layer_nodes)

# run both inputs through shared lstm
encoded_left = shared_lstm(embedded_left)
encoded_right = shared_lstm(embedded_right)

# concatenate results of both encoded vectors
merged_vector = concatenate([encoded_left, encoded_right], axis=-1)

# finish off model with output layer
prediction = Dense(1, activation='relu')(merged_vector)

# Define model hyperparameters such as optimiser and loss function
model = Model(inputs=[left_input, right_input], outputs=[prediction])
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# summarize the model
print(model.summary())

# Start training
model_trained = model.fit([x_train['left'], x_train['right']], y_train, batch_size=batch_size, epochs=epochs,
                            validation_data=([x_validation['left'], x_validation['right']], y_validation), verbose=1)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 213)           0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 213)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 213, 300)      36427500    input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       embedding_1[0][0]       

### Evaluate model scalar test loss and accuracy

In [13]:
test_loss_accuracy = model.evaluate([x_validation['left'], x_validation['right']], y_validation)
print('Scalar test loss: ' + str(test_loss_accuracy[0]) + '\nModel accuracy: ' + str(test_loss_accuracy[1]))

Scalar test loss: 0.176279500335
Model accuracy: 0.740928541398
