# Quora question pairs

## 1. Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 2. Data preprocessing

In [None]:
# Read the training set and discard the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- confirm against the file)
quora_train = (
    pd.read_csv('/kaggle/input/quora-question/quora_train.csv')
    .drop(columns='Unnamed: 0')
)
quora_train

In [None]:
# Read the test set and discard the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- confirm against the file)
quora_test = (
    pd.read_csv('/kaggle/input/quora-question/quora_test.csv')
    .drop(columns='Unnamed: 0')
)
quora_test

In [None]:
# Remove every row that contains at least one missing value, then renumber
# the remaining rows from 0 in both data sets
quora_train = quora_train.dropna(axis='index', how='any').reset_index(drop=True)
quora_test = quora_test.dropna(axis='index', how='any').reset_index(drop=True)

quora_train

In [None]:
# Make sure both question columns hold plain strings in the train and test sets
for frame in (quora_train, quora_test):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].astype('str')

In [None]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a question string before tokenisation.

    The text is lower-cased, optionally stripped of English stop words,
    cleaned with an ordered list of regex substitutions (character whitelist,
    contraction expansion, spacing around kept punctuation, a few
    abbreviations) and optionally stemmed.

    Args:
        text: The raw text as a ``str``.
        remove_stopwords: If true, drop NLTK English stop words. This happens
            right after lower-casing, i.e. before the regex clean-up.
        stem_words: If true, stem every word with the Snowball stemmer after
            the regex clean-up.

    Returns:
        The cleaned text as a single ``str`` (not a list, despite the name).
        A single leading or trailing space may remain; only runs of two or
        more whitespace characters are collapsed.
    """
    # Lower-case and split on whitespace
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Ordered (pattern, replacement) pairs; later rules rely on earlier ones,
    # so the order must not be changed.
    substitutions = (
        # NOTE(review): "+-=" inside this class is a character RANGE ('+' up
        # to '='), so ':', ';' and '<' are kept as well. Left as-is because
        # the ":" rule further down depends on ':' surviving this step.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Contractions
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Punctuation: either dropped or padded with spaces
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "50k" -> "50000"; the word boundary keeps units such as "5kg" or
        # "10km" from being rewritten.
        (r"(\d+)(k)\b", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990". The previous pattern r"\0s" was an octal escape
        # (NUL followed by "s") and therefore never matched anything.
        (r"0s\b", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued
        # onto the number (previously "the 9 11 attack" -> "the911attack").
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse whitespace runs created by the rules above
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [None]:
# Run the text cleaner (default options) over every question in both data sets
for frame in (quora_train, quora_test):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].apply(text_to_wordlist)

## 3. Exploratory data analysis

In [None]:
# Class balance: how many pairs carry each value of the is_duplicate label
duplicate_labels = quora_train['is_duplicate']
duplicate_labels.value_counts()

In [None]:
# Count how often each question id appears across qid1 and qid2 combined,
# then report how many ids appear more than once
all_question_ids = pd.concat([quora_train['qid1'], quora_train['qid2']])
question_count = all_question_ids.value_counts().values
repeated_questions = sum(question_count > 1)
print('There are ' + str(repeated_questions) + ' repeated questions.')

In [None]:
# Log-scaled histogram of how many times each question id occurs
plt.figure(figsize=(12, 5))

# log=True puts the count axis on a log scale. It replaces the separate
# plt.yscale('log', nonposy='clip') call: the `nonposy` keyword was renamed to
# `nonpositive` and later removed from matplotlib, whereas hist(log=True) is
# accepted by both old and new releases.
plt.hist(question_count, bins=50, log=True)

plt.title('Log-Histogram of question appearance counts')

# Set axis labels
plt.xlabel('Number of occurences of question')
plt.ylabel('Number of questions')

# Show the plot
plt.show()

In [None]:
# Character length of every question1 and question2 in the training set
sizeQuestionOne = [len(question) for question in quora_train['question1']]
sizeQuestionTwo = [len(question) for question in quora_train['question2']]

# Shortest and longest question on each side
(min(sizeQuestionOne), min(sizeQuestionTwo),
 max(sizeQuestionOne), max(sizeQuestionTwo))

In [None]:
# Histogram of question1 character counts (100 bins over the range 0-300)
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(sizeQuestionOne, bins=100, range=[0, 300])

ax.set_title('Histogram of character count in questions', fontsize=13)
ax.set_xlabel('Number of characters', fontsize=13)
ax.set_ylabel('Number of questions', fontsize=13)

plt.show()

In [None]:
# Histogram of question2 character counts (100 bins over the range 0-300)
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(sizeQuestionTwo, bins=100, range=[0, 300])

ax.set_title('Histogram of character count in questions', fontsize=13)
ax.set_xlabel('Number of characters', fontsize=13)
ax.set_ylabel('Number of questions', fontsize=13)

plt.show()

In [None]:
# Scatter plot: character count of question1 against that of question2
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sizeQuestionOne, sizeQuestionTwo)

ax.set_title('Correlation of character count in question1 and question2', fontsize=13)
ax.set_xlabel('Number of characters of question1', fontsize=13)
ax.set_ylabel('Number of characters of question2', fontsize=13)

plt.show()

In [None]:
# Number of words in every question1 and question2 of the training set.
# str.split() without an argument splits on any whitespace run and yields no
# empty tokens. The previous split(" ") counted an extra "word" for every
# leading/trailing space, and the text cleaner leaves a trailing space
# wherever a "?" was removed.
wordQuestionOne = quora_train['question1'].apply(lambda s: len(s.split()))
wordQuestionTwo = quora_train['question2'].apply(lambda s: len(s.split()))

# Smallest and largest word count on each side
min(wordQuestionOne), min(wordQuestionTwo), max(wordQuestionOne), max(wordQuestionTwo)

In [None]:
# Histogram of question1 word counts (100 bins over the range 0-60)
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(wordQuestionOne, bins=100, range=[0, 60])

ax.set_title('Histogram of word count in questions', fontsize=13)
ax.set_xlabel('Number of words', fontsize=13)
ax.set_ylabel('Number of questions', fontsize=13)

plt.show()

In [None]:
# Histogram of question2 word counts (100 bins over the range 0-60)
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(wordQuestionTwo, bins=100, range=[0, 60])

ax.set_title('Histogram of word count in questions', fontsize=13)
ax.set_xlabel('Number of words', fontsize=13)
ax.set_ylabel('Number of questions', fontsize=13)

plt.show()

In [None]:
# Scatter plot: word count of question1 against that of question2
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(wordQuestionOne, wordQuestionTwo)

ax.set_title('Correlation of word count in question1 and question2', fontsize=13)
ax.set_xlabel('Number of words of question1', fontsize=13)
ax.set_ylabel('Number of words of question2', fontsize=13)

plt.show()

In [None]:
def normalized_word_share(row):
    """Return the share of distinct words common to both questions of a pair.

    The value is ``|W1 & W2| / (|W1| + |W2|)`` where ``W1`` and ``W2`` are the
    sets of lower-cased words of ``row['question1']`` and ``row['question2']``.
    It therefore ranges from 0.0 (no common word) to 0.5 (identical word sets).

    Args:
        row: A mapping (e.g. a DataFrame row) with string values under the
            keys ``'question1'`` and ``'question2'``.

    Returns:
        The word share as a float; 0.0 when neither question has any word.
    """
    # split() without an argument ignores leading/trailing/repeated whitespace.
    # The previous split(" ") produced an empty-string "word" for every stray
    # space, which then counted as a shared word in both sets.
    words1 = {word.lower() for word in row['question1'].split()}
    words2 = {word.lower() for word in row['question2'].split()}

    total = len(words1) + len(words2)
    if total == 0:
        # Both questions are empty or whitespace only
        return 0.0
    return len(words1 & words2) / total

# Word share of every training pair, computed row by row
wordShare = quora_train.apply(normalized_word_share, axis='columns')

In [None]:
# Two panels comparing the word share of duplicate and distinct pairs
duplicate_flag = quora_train['is_duplicate']
plt.figure(figsize=(14, 8))

# Left panel: violin plot of the word share for each is_duplicate value
violin_ax = plt.subplot(1, 2, 1)
sns.violinplot(x=duplicate_flag, y=wordShare)
violin_ax.set_title('Percentage of word in common between question1 and question2', fontsize=13)
violin_ax.set_xlabel('is_duplicate', fontsize=13)
violin_ax.set_ylabel('Percentage of word share', fontsize=13)

# Right panel: overlaid distributions, one per class
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
density_ax = plt.subplot(1, 2, 2)
class_styles = (
    (1.0, 'green', 'is_duplicate = 1'),
    (0.0, 'red', 'is_duplicate = 0'),
)
for flag_value, colour, legend_label in class_styles:
    sns.distplot(wordShare[duplicate_flag == flag_value], color=colour, label=legend_label)

density_ax.set_title('Distribution of word share between question1 and question2', fontsize=13)
density_ax.set_xlabel('Percentage of word share', fontsize=13)
density_ax.set_ylabel('Probability', fontsize=13)
density_ax.legend()

plt.show()

## 4. Modeling

In [None]:
# Hyper-parameters

# Text representation
MAX_SEQUENCE_LENGTH = 30  # Length every token sequence is padded/truncated to
MAX_NB_WORDS = 200_000  # Vocabulary cap handed to the tokenizer (most frequent words)
EMBEDDING_DIM = 300  # Size of each word embedding vector

# Network
num_lstm = 205  # Output dimensionality of the LSTM layer
num_dense = 125  # Number of hidden units in the Dense layer
rate_drop_dense = 0.15  # Dropout rate of the Dropout layers

In [None]:
# Fit one tokenizer on every question of the training and the test set, then
# turn each question into a sequence of word indices
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# pd.concat replaces the former Series.append call: Series.append was
# deprecated and then removed in pandas 2.0, while pd.concat works everywhere.
all_questions = pd.concat([
    quora_train['question1'],
    quora_train['question2'],
    quora_test['question1'],
    quora_test['question2'],
])
tokenizer.fit_on_texts(all_questions)

sequences_1 = tokenizer.texts_to_sequences(quora_train['question1'])
sequences_2 = tokenizer.texts_to_sequences(quora_train['question2'])
test_sequences_1 = tokenizer.texts_to_sequences(quora_test['question1'])
test_sequences_2 = tokenizer.texts_to_sequences(quora_test['question2'])

# Mapping word -> integer index for every word seen while fitting
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

In [None]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH entries and
# collect the labels as an array
data_1, data_2 = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (sequences_1, sequences_2)
)
labels = np.array(quora_train['is_duplicate'])
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

# Same treatment for the test sequences
test_data_1, test_data_2 = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (test_sequences_1, test_sequences_2)
)

In [None]:
# Load the pre-trained word2vec vectors
word2vec = KeyedVectors.load_word2vec_format('/kaggle/input/embedding/GoogleNews-vectors-negative300.bin', binary=True)
# len(word2vec.vectors) is the number of stored vectors; unlike the `.vocab`
# attribute it is available in both gensim 3.x and gensim 4.x.
print('Found %s word vectors of word2vec' % len(word2vec.vectors))

# Prepare embeddings: row i holds the vector of the word with tokenizer index i
print('Preparing embedding matrix')

# +1 because tokenizer indices start at 1 (row 0 stays all-zero)
nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index lists every word seen, even beyond the MAX_NB_WORDS cap.
    # Skip those indices, which would otherwise raise an IndexError.
    if i >= nb_words:
        continue
    # Membership test and item access work on KeyedVectors in gensim 3.x and 4.x
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]

# Rows left entirely at zero: words without a pre-trained vector, plus row 0
print('Null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

In [None]:
# Build the network graph. Both questions pass through the SAME embedding
# layer and the SAME LSTM layer, so the two branches share their weights.

# Frozen embedding initialised from the pre-computed embedding matrix
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

lstm_layer = LSTM(units=num_lstm)

# Branch for question 1
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch for question 2
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Join both encodings, then regularise and normalise
merged = concatenate([x1, y1])
merged = Dropout(rate=rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Hidden dense layer followed by the same dropout/normalisation pair
merged = Dense(units=num_dense, activation='relu')(merged)
merged = Dropout(rate=rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Single sigmoid unit: one output value per question pair
preds = Dense(units=1, activation='sigmoid')(merged)

In [None]:
# Random train/validation split
VALIDATION_SPLIT = 0.2  # Fraction of the data held out for validation
perm = np.random.permutation(len(data_1))  # Shuffled row indices
split_at = int(len(data_1)*(1-VALIDATION_SPLIT))
idx_train, idx_val = perm[:split_at], perm[split_at:]

data_1_train, data_2_train, labels_train = (
    data_1[idx_train], data_2[idx_train], labels[idx_train]
)
data_1_val, data_2_val, labels_val = (
    data_1[idx_val], data_2[idx_val], labels[idx_val]
)

# Whether to re-weight classes to fit the 17.5% share in test set
re_weight = True

# Per-class weights used during training (None disables re-weighting)
class_weight = {0: 1.309028344, 1: 0.472001959} if re_weight else None

# Per-sample weights for the validation data, using the same two values
weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309028344, 0.472001959)

In [None]:
# Assemble, compile and fit the two-input model
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['acc'])

# Validation inputs, labels and per-sample weights
validation_set = ([data_1_val, data_2_val], labels_val, weight_val)

hist = model.fit(
    [data_1_train, data_2_train],
    labels_train,
    validation_data=validation_set,
    epochs=10,
    batch_size=128,
    shuffle=True,
    class_weight=class_weight,
)

In [None]:
# Per-epoch accuracy of the training data and validation data
acc = hist.history['acc']
val_acc = hist.history['val_acc']

# Per-epoch loss of the training data and validation data
loss = hist.history['loss']
val_loss = hist.history['val_loss']

# Epochs are numbered from 1 on the x-axis
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')

# Set axis labels
plt.xlabel('epoch', fontsize=13)
plt.ylabel('accuracy', fontsize=13)

plt.title('Training and validation accuracy')
plt.legend()

# Second figure: loss curves
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')

# Set axis labels (the y-axis of this figure shows the loss; it was
# previously mislabelled 'accuracy')
plt.xlabel('epoch', fontsize=13)
plt.ylabel('loss', fontsize=13)

plt.title('Training and validation loss')

plt.legend()

# Show the plot
plt.show()

In [None]:
# Evaluate the trained model on the padded test sequences and their labels
test_inputs = [test_data_1, test_data_2]
model.evaluate(x=test_inputs, y=quora_test['is_duplicate'])

In [None]:
# Store the model output for every test pair in a new 'prob' column
test_probabilities = model.predict([test_data_1, test_data_2])
quora_test['prob'] = test_probabilities

In [None]:
# Precision and recall at every candidate threshold (thresholds land in `_`)
test_labels = quora_test['is_duplicate']
test_scores = quora_test['prob']
nn_precision, nn_recall, _ = precision_recall_curve(test_labels, test_scores)

# f-value (harmonic mean of precision and recall) at each threshold
nn_f = 2*nn_precision*nn_recall / (nn_precision+nn_recall)

# Precision-recall curve: recall on the x-axis, precision on the y-axis
fig, ax = plt.subplots()
ax.plot(nn_recall, nn_precision, marker='.')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('The precision-recall curve')

plt.show()

In [None]:
# Precision, recall and f-value as functions of the decision threshold.
#
# The thresholds are recomputed here under an explicit name instead of being
# read from `_`. `_` is a throwaway name that the later ROC cell reassigns and
# that IPython also uses for the last displayed output, so re-running this
# cell out of order could plot against the wrong array.
nn_thresholds = precision_recall_curve(quora_test['is_duplicate'], quora_test['prob'])[2]

# precision/recall/f-value hold one more entry than the thresholds, hence [:-1]
plt.plot(nn_thresholds, nn_precision[:-1], label = 'precision')
plt.plot(nn_thresholds, nn_recall[:-1], label = 'recall')
plt.plot(nn_thresholds, nn_f[:-1], label = 'f-value')

plt.legend(loc = 'lower center')

# Set axis label
plt.xlabel('threshold')

plt.title('The precision/ recall/ f-value curve')

# Show the plot
plt.show()

In [None]:
# Predicted label: 1 when the predicted probability exceeds 0.38, otherwise 0
quora_test['pred'] = (quora_test['prob'] > 0.38).astype('int64')

quora_test

In [None]:
# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    """Draw the confusion matrix and its two normalised forms as heatmaps.

    Three panels are drawn side by side: the raw counts, the column-normalised
    matrix (titled "Precision matrix") and the row-normalised matrix (titled
    "Recall matrix"). Tick labels are fixed to the two classes 0 and 1.

    Args:
        test_y: The true labels.
        predict_y: The predicted labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Raw counts: rows are the original classes, columns the predicted classes
    C = confusion_matrix(test_y, predict_y)

    # Divide every row by its sum -> each row of A sums to 1
    A = (C.T / C.sum(axis=1)).T

    # Divide every column by its sum -> each column of B sums to 1
    # NOTE: a class that is never predicted (or never present) gives a zero
    # sum and therefore NaN entries in the corresponding matrix.
    B = C / C.sum(axis=0)

    plt.figure(figsize=(20, 4))

    class_labels = [0, 1]
    cmap = sns.light_palette("blue")

    # (matrix, annotation format, title) per panel. The counts are integers,
    # so they are annotated with "d" instead of the former ".3f".
    panels = (
        (C, "d", "Confusion matrix"),
        (B, ".3f", "Precision matrix"),
        (A, ".3f", "Recall matrix"),
    )
    for position, (matrix, annotation_format, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        sns.heatmap(matrix, annot=True, cmap=cmap, fmt=annotation_format,
                    xticklabels=class_labels, yticklabels=class_labels)
        plt.xlabel('Predicted Class')
        plt.ylabel('Original Class')
        plt.title(title)

    plt.show()


plot_confusion_matrix(quora_test['is_duplicate'], quora_test['pred'])

In [None]:
# Print accuracy, precision, recall and f-value (in that order) of the
# thresholded predictions on the test set
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(quora_test['is_duplicate'], quora_test['pred']))

In [None]:
# False/true positive rates at every threshold (thresholds land in `_`)
nn_fpr, nn_tpr, _ = roc_curve(quora_test['is_duplicate'], quora_test['prob'])

# ROC curve: false positive rate on the x-axis, true positive rate on the y-axis
fig, ax = plt.subplots()
ax.plot(nn_fpr, nn_tpr, marker='.')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('The ROC curve')

plt.show()

In [None]:
# Area under the ROC curve on the test set, printed with three decimals
nn_auc = roc_auc_score(y_true=quora_test['is_duplicate'], y_score=quora_test['prob'])
print('ROC AUC=%.3f' % (nn_auc))