In [5]:
#Let's start by importing our csv's as DataFrames
import pandas as pd

# Read the training and test CSVs (in that order) into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Show (rows, columns) of each frame as a quick sanity check.
print('Training: ', df_train.shape)
print('Test: ', df_test.shape)

Training:  (404290, 6)
Test:  (2345796, 3)


In [6]:
#Preview the first rows of the training frame (head() shows five rows by default)
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
#It's a good idea to take only a small sample of the data
#We can remove this block when we are ready to test the whole dataset
sample_size = 10000
sample_seed = 42  #fixed seed so Restart & Run All always selects (and orders) the same rows

true_questions = df_train[df_train.is_duplicate == 1]
false_questions = df_train[df_train.is_duplicate == 0]

#Take the same number of rows from each class; cap at the smaller class so that
#DataFrame.sample never raises for asking for more rows than are available
rows_per_class = min(sample_size, len(true_questions), len(false_questions))

df_train = pd.concat([
    true_questions.sample(rows_per_class, random_state=sample_seed),
    false_questions.sample(rows_per_class, random_state=sample_seed),
])
#Shuffle all rows so duplicates and non-duplicates are interleaved
df_train = df_train.sample(frac=1, random_state=sample_seed)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 226657 to 386555
Data columns (total 6 columns):
id              20000 non-null int64
qid1            20000 non-null int64
qid2            20000 non-null int64
question1       20000 non-null object
question2       20000 non-null object
is_duplicate    20000 non-null int64
dtypes: int64(4), object(2)
memory usage: 1.1+ MB


Okay, let's get started extracting some simple (and possibly insightful) features.

(Side note) When using the `axis` parameter of the DataFrame `apply` method:
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
0 or ‘index’: apply function to each column
1 or ‘columns’: apply function to each row

In [13]:
def _char_count(text):
    """Length of the question after casting it to str."""
    return len(str(text))


def _distinct_char_count(text):
    """Number of distinct characters in the question, ignoring spaces."""
    return len(set(str(text).replace(' ', '')))


def _word_count(text):
    """Number of whitespace-separated words in the question."""
    return len(str(text).split())


def _shared_word_count(row):
    """Number of distinct lower-cased words that occur in both questions of a row."""
    first_words = set(str(row['question1']).lower().split())
    second_words = set(str(row['question2']).lower().split())
    return len(first_words & second_words)


def _row_scorer(scorer):
    """Wrap a two-string scorer so it can be applied row-wise to the question pair."""
    def score_row(row):
        return scorer(str(row['question1']), str(row['question2']))
    return score_row


question1_text = df_train['question1']
question2_text = df_train['question2']

# Character length of each question, and the signed difference (q1 - q2)
df_train['q1_feat_length'] = question1_text.apply(_char_count)
df_train['q2_feat_length'] = question2_text.apply(_char_count)

df_train['feat_length_diff'] = df_train['q1_feat_length'] - df_train['q2_feat_length']

# Count of distinct non-space characters per question
df_train['q1_feat_num_chars'] = question1_text.apply(_distinct_char_count)
df_train['q2_feat_num_chars'] = question2_text.apply(_distinct_char_count)

# Word count per question
df_train['q1_feat_num_words'] = question1_text.apply(_word_count)
df_train['q2_feat_num_words'] = question2_text.apply(_word_count)

# Words shared by the two questions (case-insensitive)
df_train['feat_num_matching_words'] = df_train.apply(_shared_word_count, axis=1)

# FuzzyWuzzy is a string-matching library with several similarity helpers:
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
from fuzzywuzzy import fuzz

# QRatio ("quick ratio"): similarity ratio between the two strings, from 0 to 100
df_train['feat_qratio'] = df_train.apply(_row_scorer(fuzz.QRatio), axis=1)

# WRatio: similarity measure between 0 and 100 that combines different algorithms
df_train['feat_wratio'] = df_train.apply(_row_scorer(fuzz.WRatio), axis=1)


In [21]:
#First, we clean the data to give our model the best chance of identifying duplicates accurately.
#Here that means removing stop words from each question.

import nltk
nltk.download("stopwords") #downloads the NLTK stop-word corpus into the local nltk_data directory
from tqdm import tqdm #for progressbar

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    # Imported here because the notebook never brings `stopwords` into scope,
    # which made the original function fail with NameError on a fresh kernel.
    from nltk.corpus import stopwords
    return frozenset(stopwords.words("english"))


def extractStopWords(question, stops=None):
    """Lower-case a question, split it on whitespace and drop the stop words.

    Despite the name, the return value is the list of words that are NOT
    stop words.

    Parameters
    ----------
    question : str
        Raw question text. Any non-string value (for example None, or the NaN
        pandas uses for a missing cell) produces an empty list.
    stops : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is built once and reused across calls.

    Returns
    -------
    list of str
        The remaining words, lower-cased, in their original order.
    """
    if not isinstance(question, str):
        return []
    if stops is None:
        stops = _english_stop_words()
    # Lower-case BEFORE the membership test: the stop list is lower-case, so
    # testing the raw word would let "What" or "The" slip through.
    lowered_words = (word.lower() for word in question.split())
    return [word for word in lowered_words if word not in stops]
        
# Clean each question column in turn, showing a separately labelled progress bar for each.
for raw_column, cleaned_column, bar_label in (
    ('question1', 'cleaned_q1', 'Question1 Progress'),
    ('question2', 'cleaned_q2', 'Question2 Progress'),
):
    # tqdm.pandas registers progress_apply using the given bar settings
    tqdm.pandas(mininterval=2, ncols=60, desc=bar_label)
    df_train[cleaned_column] = df_train[raw_column].progress_apply(extractStopWords)



Question1 Progress:   0%| | 1/20000 [02:05<698:57:19, 125.82s/it]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\G4M3M4ST3R\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.




Question1 Progress:   0%|         | 0/20000 [00:00<?, ?it/s]

Question1 Progress:  34%|▎| 6732/20000 [00:02<00:03, 3365.18it/s]

Question1 Progress:  67%|▋| 13413/20000 [00:04<00:01, 3357.70it/s]

Question1 Progress: 100%|█| 20000/20000 [00:05<00:00, 3356.81it/s]

Question2 Progress:   0%|         | 0/20000 [00:00<?, ?it/s]

Question2 Progress:  33%|▎| 6667/20000 [00:02<00:04, 3332.69it/s]

Question2 Progress:  66%|▋| 13297/20000 [00:04<00:02, 3327.13it/s]

Question2 Progress:  99%|▉| 19806/20000 [00:06<00:00, 3304.76it/s]

Question2 Progress: 100%|█| 20000/20000 [00:06<00:00, 3296.82it/s]

In [24]:
#Now we can finally begin constructing our CNN
#Let's first create word vectors for our questions
import gensim

# All cleaned questions as token lists: the question1 rows followed by the question2 rows
questions = list(df_train['cleaned_q1']) + list(df_train['cleaned_q2'])


def wmd(tokens1, tokens2):
    """Word Mover's Distance between two token lists, using the loaded word vectors.

    NOTE(review): no `wmd` helper is defined in the visible notebook, so the
    original call failed with NameError on a fresh kernel. This thin wrapper
    around gensim's `wmdistance` is the assumed intent - confirm.
    """
    return model.wmdistance(tokens1, tokens2)


# NOTE: nothing is trained in this cell - pre-trained vectors are loaded from disk.
# The progress messages are kept exactly as originally written.
print ("Training model...")
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
# wmdistance compares lists of tokens, so use the cleaned token columns rather than
# the raw question strings (a raw string would be iterated character by character)
df_train['wmd'] = df_train.apply(lambda x: wmd(x['cleaned_q1'], x['cleaned_q2']), axis=1)
print ("Training Complete!")

# Create a dict mapping every token to its embedding vector.
# load_word2vec_format returns a KeyedVectors object, which holds the vocabulary and
# the vectors itself, so they are read from `model` directly instead of `model.wv`.
# gensim 4 names these `index_to_key` / `vectors`; older releases use `index2word` / `syn0`.
vocab_tokens = getattr(model, 'index_to_key', None)
if vocab_tokens is None:
    vocab_tokens = model.index2word
vocab_vectors = getattr(model, 'vectors', None)
if vocab_vectors is None:
    vocab_vectors = model.syn0

w2v = dict(zip(vocab_tokens, vocab_vectors))
print ("Number of tokens in Word2Vec:", len(w2v.keys()))

Training model...
Training Complete!
Number of tokens in Word2Vec: 6668


In [25]:
# Show the tokens whose vectors are most similar to "sugar", with their similarity scores
# NOTE(review): the saved output lists tokens such as 'food,' and 'hand?' - presumably it came from
# an earlier model trained on the questions rather than the GoogleNews vectors; re-run to confirm
model.most_similar("sugar")

[('60', 0.986575722694397),
 ('food,', 0.9854674339294434),
 ('ages', 0.9844881892204285),
 ('weak', 0.983527660369873),
 ('worried', 0.9834195971488953),
 ('overweight', 0.9833500385284424),
 ('hand?', 0.9832106828689575),
 ('emma', 0.9830905795097351),
 ('persons', 0.9826271533966064),
 ('walk', 0.9822831749916077)]