In [1]:
#Let's start by importing our CSV files as DataFrames
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read the training and test sets (expected in the working directory) into DataFrames.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))

# Report the (rows, columns) shape of each frame.
print('Training: ', df_train.shape)
print('Test: ', df_test.shape)

Training:  (404290, 6)
Test:  (2345796, 3)


In [2]:
#Preview the first rows of the training DataFrame (head() shows the first five by default)
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
#It's a good idea to take only a small sample of the data
#We can remove this block when we are ready to test the whole dataset
sample_size = 10000
# Fixed seed so that restarting the kernel and re-running reproduces the same
# sample (and therefore the same downstream features).
random_seed = 42

# Split by label so we can draw the same number of rows from each class.
true_questions = df_train[df_train.is_duplicate == 1]
false_questions = df_train[df_train.is_duplicate == 0]

# Balanced sample: `sample_size` duplicate pairs plus `sample_size` non-duplicate pairs.
df_train = pd.concat([
    true_questions.sample(sample_size, random_state=random_seed),
    false_questions.sample(sample_size, random_state=random_seed),
])
# Shuffle all rows so the two classes are interleaved rather than stacked.
df_train = df_train.sample(frac=1, random_state=random_seed)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 342265 to 98438
Data columns (total 6 columns):
id              20000 non-null int64
qid1            20000 non-null int64
qid2            20000 non-null int64
question1       20000 non-null object
question2       20000 non-null object
is_duplicate    20000 non-null int64
dtypes: int64(4), object(2)
memory usage: 1.1+ MB


Okay, let's get started extracting some simple (and possibly insightful) features.

(Side note) How the `axis` parameter of the DataFrame `apply` method works:
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
0 or ‘index’: apply function to each column
1 or ‘columns’: apply function to each row

In [4]:
# Character length of each question (values are passed through str() first,
# so a missing value is measured as the literal text 'nan').
df_train['q1_feat_length'] = df_train['question1'].map(str).map(len)
df_train['q2_feat_length'] = df_train['question2'].map(str).map(len)

# Signed length difference: question1 minus question2.
df_train['feat_length_diff'] = df_train['q1_feat_length'] - df_train['q2_feat_length']

# Number of distinct characters in each question, ignoring spaces.
df_train['q1_feat_num_chars'] = df_train['question1'].map(
    lambda text: len(set(str(text).replace(' ', ''))))
df_train['q2_feat_num_chars'] = df_train['question2'].map(
    lambda text: len(set(str(text).replace(' ', ''))))

# Number of whitespace-separated tokens in each question.
df_train['q1_feat_num_words'] = df_train['question1'].map(lambda text: len(str(text).split()))
df_train['q2_feat_num_words'] = df_train['question2'].map(lambda text: len(str(text).split()))

# Number of distinct lower-cased tokens that appear in both questions of a pair.
df_train['feat_words_in_common'] = [
    len(set(str(first).lower().split()) & set(str(second).lower().split()))
    for first, second in zip(df_train['question1'], df_train['question2'])
]

#String-matching features from the FuzzyWuzzy library:
#http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
#FuzzyWuzzy uses Levenshtein Distance to calculate differences between sequences.
from fuzzywuzzy import fuzz

#QRatio ("quick ratio"): a quick comparison between two strings,
#returned as a similarity ratio from 0-100.
df_train['feat_qratio'] = [
    fuzz.QRatio(str(first), str(second))
    for first, second in zip(df_train['question1'], df_train['question2'])
]

#WRatio: a similarity measure between 0 and 100 that uses different algorithms.
df_train['feat_wratio'] = [
    fuzz.WRatio(str(first), str(second))
    for first, second in zip(df_train['question1'], df_train['question2'])
]

#More fuzzywuzzy features could be added here, but we will move on for now

#We can add more features from fuzzywuzzy, but we will move on for now

We will now use gensim to load pre-trained word embeddings (word vectors) trained on the Google News corpus.

In [7]:
import gensim
from nltk.corpus import stopwords
# NLTK's English stop-word list; used to filter tokens before looking up word vectors.
stop_words = stopwords.words('english')

#wmd, or Word Mover's Distance, gives us a numerical distance based on how similar the words of two sentences are to each other.
#This is different from cosine similarity (used with bag-of-words),
# which does not capture similar sentences that say the same thing in completely different words.
def wmd(s1, s2):
    """Return the Word Mover's Distance between two questions.

    Both inputs are converted with ``str()``, lower-cased, split on whitespace
    and stripped of English stop words before being handed to
    ``model.wmdistance``.

    Parameters
    ----------
    s1, s2 : object
        The two questions to compare; any value accepted by ``str()``.

    Returns
    -------
    The value returned by ``model.wmdistance`` for the two token lists.

    Notes
    -----
    Relies on the module-level ``model`` and ``stop_words``. The stop-word list
    is deliberately NOT reloaded here: this function is called once per row,
    and re-reading the NLTK corpus on every call is wasted work.
    ``model.wmdistance`` needs the ``pyemd`` package to be installed.
    """
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(tokens1, tokens2)

# Load the pre-trained Google News word2vec vectors (binary format, gzip-compressed file).
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

# Word Mover's Distance for every question pair, stored as a new feature column.
df_train['wmd'] = [
    wmd(first, second)
    for first, second in zip(df_train['question1'], df_train['question2'])
]



ImportError: Please install pyemd Python package to compute WMD.

Now, we will generate sentence vectors based on the pre-trained word vectors loaded above.

For each question in the dataframe, we use "word_tokenize" from nltk to tokenize each word.
Each word is then put through the pre-trained model, and appended to the temporary array M.
Lastly, we sum all the word vectors into a single vector, "v", and normalize it to unit length by dividing it by its Euclidean norm:

$\hat{v} = \dfrac{v}{\sqrt{\sum_i v_i^2}}$

In [None]:
from nltk import word_tokenize

def sent2vec(s):
    """Turn a sentence into a single unit-length vector of summed word embeddings.

    The input is converted with ``str()``, lower-cased and tokenized with NLTK's
    ``word_tokenize``. Stop words and non-alphabetic tokens are dropped, each
    remaining token is looked up in the module-level ``model``, and the vectors
    found are summed and divided by the Euclidean norm of the sum.

    Parameters
    ----------
    s : object
        The sentence; any value accepted by ``str()``.

    Returns
    -------
    numpy.ndarray
        The normalized sum of the word vectors. If no token is found in the
        model (or the sum has zero norm), a zero vector of length
        ``model.vector_size`` is returned instead of a NaN from 0/0.
    """
    # Python 3 strings are already unicode; the former .decode('utf-8') call
    # raised AttributeError on str.
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Token is not in the embedding vocabulary: skip it. Any other
            # error is a real problem and is allowed to propagate.
            continue

    if not vectors:
        return np.zeros(model.vector_size)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(model.vector_size)
    return v / norm


# One sentence vector per row for each question column. The width of 300 is
# assumed to match the dimensionality of the loaded embeddings (the file name
# suggests 300) -- TODO confirm if a different model is used.
question1_vectors = np.zeros((df_train.shape[0], 300))
# Not read anywhere in the visible cells; kept in case a later cell uses it.
error_count = 0

# tqdm cannot infer a length from enumerate(), so pass the total explicitly;
# otherwise the progress bar shows only a running count with no percentage or ETA.
for i, q in tqdm(enumerate(df_train.question1.values), total=df_train.shape[0]):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((df_train.shape[0], 300))
for i, q in tqdm(enumerate(df_train.question2.values), total=df_train.shape[0]):
    question2_vectors[i, :] = sent2vec(q)


In [None]:
# List the words the embedding model ranks as most similar to "sugar".
model.most_similar("sugar")