In [1]:
import gensim
import pandas as pd
from scipy.spatial.distance import cosine
from nltk import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

Load word2vec using gensim:

In [2]:
# Load pretrained word vectors from a gzipped file in binary word2vec format
# (binary=True). The resulting KeyedVectors object is bound to `model_w2v`,
# which the word2vec-based feature functions later in the notebook read as a global.
model_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Load data:

In [3]:
# Read the training pairs and cast both question columns to str in one step,
# so the text helpers always receive string values.
df = pd.read_csv('train.csv').astype({'question1': str, 'question2': str})

Define the row-wise feature-extraction functions:

In [19]:
from nltk.corpus import stopwords

# English stop words as a set; word_match_share tests tokens against it
# to skip stop words before comparing the two questions.
stops = set(stopwords.words("english"))

def word_match_share(row):
    """Return the share of distinct non-stop-word tokens common to both questions.

    Each question is lower-cased and split on whitespace; tokens found in the
    global ``stops`` set are ignored. The result is
    ``2 * len(shared) / (len(q1) + len(q2))`` over the two sets of remaining
    tokens.

    Args:
        row: Mapping-like object with ``'question1'`` and ``'question2'`` entries.

    Returns:
        The overlap ratio, or ``0`` when either question has no tokens left
        after stop-word removal.
    """
    def content_tokens(text):
        # Distinct lower-cased tokens that are not stop words.
        return {token for token in str(text).lower().split() if token not in stops}

    q1_tokens = content_tokens(row['question1'])
    q2_tokens = content_tokens(row['question2'])
    if not q1_tokens or not q2_tokens:
        # A few questions consist of nothing but stop words.
        return 0

    # Every shared token is counted once per question, hence the factor of two.
    common = q1_tokens & q2_tokens
    return 2 * len(common) / (len(q1_tokens) + len(q2_tokens))

def w2v_get(words):
    """Average the word2vec vectors of the in-vocabulary tokens in ``words``.

    Args:
        words: Iterable of string tokens.

    Returns:
        A 1-D numpy array of length ``model_w2v.vector_size``: the element-wise
        mean of the vectors of all tokens known to ``model_w2v``, or an
        all-zero vector when none of the tokens is in the vocabulary.
    """
    # Membership is tested on the KeyedVectors object itself rather than on its
    # ``vocab`` attribute, which newer gensim releases no longer provide.
    cleaned = [w for w in words if w in model_w2v]
    if not cleaned:
        # Nothing to average: return the zero-vector sentinel explicitly instead
        # of relying on a bare ``except`` that would also hide unrelated errors.
        return np.zeros(model_w2v.vector_size)
    return np.mean(model_w2v[cleaned], axis=0)


def w2v_distance(row):
    """Cosine distance between the mean word2vec vectors of the two questions.

    Args:
        row: Object exposing ``question1`` and ``question2`` string attributes
            (e.g. a DataFrame row).

    Returns:
        The cosine distance between the averaged vectors, or ``nan`` when
        either averaged vector is all zeros (``w2v_get`` returns such a vector
        for a question without any in-vocabulary token), because the cosine
        distance is undefined for a zero vector.
    """
    q1v = w2v_get(word_tokenize(row.question1))
    q2v = w2v_get(word_tokenize(row.question2))
    if not np.any(q1v) or not np.any(q2v):
        # A zero vector has no direction; return the "undefined" marker
        # explicitly rather than dividing by a zero norm inside ``cosine``.
        return np.nan
    return cosine(q1v, q2v)

def w2v_distance2(row):
    """Distance between the tokenized questions as computed by ``model_w2v.wmdistance``.

    Args:
        row: Object exposing ``question1`` and ``question2`` string attributes.

    Returns:
        The value returned by ``model_w2v.wmdistance`` for the two token lists.
    """
    return model_w2v.wmdistance(
        word_tokenize(row.question1),
        word_tokenize(row.question2),
    )


def tfidf_dis(row):
    """Cosine distance between the TF-IDF vectors of the two questions.

    A fresh ``TfidfVectorizer`` (English stop words removed) is fitted on just
    the two questions of ``row``.

    Args:
        row: Mapping-like object with ``'question1'`` and ``'question2'`` entries.

    Returns:
        The cosine distance between the two TF-IDF rows, or ``1.0`` when the
        vectorizer raises ``ValueError`` because it cannot build a vocabulary
        from the pair (e.g. both questions contain only stop words).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        vec = vectorizer.fit_transform([str(row['question1']), str(row['question2'])])
    except ValueError:
        # Only the expected "no usable vocabulary" failure maps to the maximum
        # distance; any other error should surface instead of being hidden.
        return 1.0
    # ``cosine`` expects 1-D vectors. Rows of ``todense()`` are 2-D matrices,
    # which newer SciPy releases reject, so use a plain ndarray instead.
    dense = vec.toarray()
    return cosine(dense[0], dense[1])

def jaccard(row):
    """Jaccard distance between the character sets of the two questions.

    ``set()`` is applied to the raw strings, so the comparison is over the
    distinct characters of each question, not over its words.

    Args:
        row: Mapping-like object with string ``'question1'`` and
            ``'question2'`` entries.

    Returns:
        One minus the ratio of shared characters to all characters seen in
        either question. Returns ``0.0`` when both questions are empty: two
        empty sets are identical, and the ratio would otherwise divide by zero.
    """
    s1 = set(row['question1'])
    s2 = set(row['question2'])
    union = s1 | s2
    if not union:
        return 0.0
    return 1 - len(s1 & s2) / len(union)

def len_diff(row):
    """Relative difference between the character lengths of the two questions.

    Args:
        row: Mapping-like object with string ``'question1'`` and
            ``'question2'`` entries.

    Returns:
        ``abs(l1 - l2) / (l1 + l2)`` where ``l1`` and ``l2`` are the lengths of
        the two questions. Returns ``0.0`` when both are empty, since there is
        no difference and the ratio would otherwise divide by zero.
    """
    l1 = len(row['question1'])
    l2 = len(row['question2'])
    total = l1 + l2
    if total == 0:
        return 0.0
    return abs(l1 - l2) / total

Extract features:

In [7]:
# Compute each pairwise feature row by row and store it as a new column of df.
# Columns are added in the listed order.
_row_features = [
    ('w2v_dist', w2v_distance),
    ('w2v_dist2', w2v_distance2),
    ('tfidf_dist', tfidf_dis),
    ('jaccard_dist', jaccard),
    ('len_diff', len_diff),
    ('word_share', word_match_share),
]
for _column, _extractor in _row_features:
    df[_column] = df.apply(_extractor, axis=1)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [21]:
# Preview the first rows of df, including the feature columns added to it.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,w2v_dist,w2v_dist2,tfidf_dist,jaccard_dist,len_diff,word_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.042931,0.470209,0.104468,0.0,0.073171,0.727273
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.314483,2.118396,0.525669,0.375,0.266187,0.307692
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.229717,1.991613,0.774235,0.3,0.106061,0.363636
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.520283,3.080408,1.0,0.575758,0.130435,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.32083,2.409782,0.793916,0.333333,0.321739,0.0


Split the data into a training set and a validation set:

In [22]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module is not available in current scikit-learn.
from sklearn.model_selection import train_test_split

# Feature columns fed to the classifier, and the binary duplicate label.
feature_columns = ['w2v_dist', 'w2v_dist2', 'tfidf_dist', 'jaccard_dist', 'len_diff', 'word_share']
features = df[feature_columns]
labels = df['is_duplicate']

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible. Splitting from `features`/`labels` (instead of rebinding
# x_train/y_train to themselves) keeps this cell safe to re-run.
x_train, x_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.2, random_state=4242
)

Use XGBoost to train the binary classifier:

In [23]:
# XGBoost parameters: binary logistic objective evaluated with log loss,
# eta of 0.03 and max_depth of 4.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.03,
    'max_depth': 4,
}

# Wrap the training and validation splits in DMatrix objects with their labels.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# Both sets are evaluated during training, in this order.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# 400 boosting rounds at most, early stopping after 50 rounds,
# and an evaluation log line every 10 rounds.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

[0]	train-logloss:0.68218	valid-logloss:0.682283
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.600555	valid-logloss:0.601491
[20]	train-logloss:0.550806	valid-logloss:0.552397
[30]	train-logloss:0.518355	valid-logloss:0.520375
[40]	train-logloss:0.496495	valid-logloss:0.498773
[50]	train-logloss:0.481461	valid-logloss:0.483934
[60]	train-logloss:0.470818	valid-logloss:0.473504
[70]	train-logloss:0.463094	valid-logloss:0.465958
[80]	train-logloss:0.457482	valid-logloss:0.460474
[90]	train-logloss:0.453445	valid-logloss:0.456554
[100]	train-logloss:0.450511	valid-logloss:0.453736
[110]	train-logloss:0.448144	valid-logloss:0.45146
[120]	train-logloss:0.446187	valid-logloss:0.449612
[130]	train-logloss:0.444642	valid-logloss:0.448153
[140]	train-logloss:0.443297	valid-logloss:0.44688
[150]	train-logloss:0.44228	valid-logloss:0.44592
[160]	train-logloss:0.441391	vali

In [47]:
# Deprecated: token-list based variant of the word2vec distance feature.
# NOTE: running this cell rebinds ``w2v_distance`` to a two-argument function,
# shadowing the row-based ``w2v_distance(row)`` defined earlier in the notebook.

def w2v_distance(q1, q2):
    """Cosine distance between the mean word2vec vectors of two token lists.

    Args:
        q1: Tokens of the first question.
        q2: Tokens of the second question.

    Returns:
        ``cosine(w2v_get(q1), w2v_get(q2))``.
    """
    q1v = w2v_get(q1)
    q2v = w2v_get(q2)
    return cosine(q1v, q2v)

# Build the whole column in one pass. This replaces the row-by-row
# ``DataFrame.set_value`` calls, an API that current pandas no longer provides.
df['w2v_dist'] = [
    w2v_distance(word_tokenize(q1), word_tokenize(q2))
    for q1, q2 in zip(df['question1'], df['question2'])
]