In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [None]:
df = pd.read_csv('clean.csv')

In [2]:
df = pd.read_csv('train.csv')
df['question2'] = df['question2'].astype(str)
df['question1'] = df['question1'].astype(str)

In [2]:


def stemming(row):
    stops = set(stopwords.words("english"))
    tokens = word_tokenize(row)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens if word not in stops]
    return stemmed

def tfidf_tokenizer(row):
    tokens = word_tokenize(row)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    return stemmed

In [34]:
df['q1_stemmed'] = df.question1.apply(stemming)
df['q2_stemmed'] = df.question2.apply(stemming)

In [47]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_stemmed,q2_stemmed,tfidf_dist
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, step, step, guid, invest, share, market...","[what, step, step, guid, invest, share, market...",0.09445
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, stori, kohinoor, (, koh-i-noor, ), diam...","[what, would, happen, indian, govern, stole, k...",0.430166
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, I, increas, speed, internet, connect, us...","[how, internet, speed, increas, hack, dn, ?]",0.549732
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[whi, I, mental, lone, ?, how, I, solv, ?]","[find, remaind, [, math, ], 23^, {, 24, }, [, ...",0.9087
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolv, water, quikli, sugar, ,,...","[which, fish, would, surviv, salt, water, ?]",0.771173


In [2]:
model_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

OSError: Not a gzipped file (b've')

In [52]:
# from scipy.spatial.distance import cosine
# from sklearn.feature_extraction.text import TfidfVectorizer
# def tfidf_dis(row):
#     try:
#         vectorizer = TfidfVectorizer(tokenizer=tfidf_tokenizer, stop_words='english')
#         vec = vectorizer.fit_transform([row['question1'], row['question2']])
#         vec = vec.todense()
#         return cosine(vec[0], vec[1])
#     except:
#         return 1.0

# df['tfidf_dist'] = df.apply(tfidf_dis, axis=1)

In [None]:
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in row['q1_stemmed']:
        q1words[word] = 1
    for word in row['q2_stemmed']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def w2v_get(words):
    cleaned = [w for w in words if w in model_w2v.vocab]
    try:
        res = np.mean(model_w2v[cleaned], axis=0)
    except:
        res = np.zeros(300)
    return res


def w2v_distance(row):
    q1v = w2v_get(row.q1_stemmed)
    q2v = w2v_get(row.q2_stemmed)
    return cosine(q1v, q2v)

def w2v_distance2(row):
    q1 = row.q1_stemmed
    q2 = row.q2_stemmed
    return model_w2v.wmdistance(q1, q2)


def tfidf_dis(row):
    try:
        vectorizer = TfidfVectorizer(tokenizer=tfidf_tokenizer, stop_words='english')
        vec = vectorizer.fit_transform([row['question1'], row['question2']])
        vec = vec.todense()
        return cosine(vec[0], vec[1])
    except:
        return 1.0

def jaccard(row):
    s1 = set(row['q1_stemmed'])
    s2 = set(row['q2_stemmed'])
    return 1 - (len(s1.intersection(s2)) / len(s1.union(s2)))

def len_diff(row):
    l1 = len(row['question1'])
    l2 = len(row['question2'])
    return abs(l1 - l2) / (l1 + l2)

In [None]:
df['w2v_dist'] = df.apply(w2v_distance, axis=1)
df['w2v_dist2'] = df.apply(w2v_distance2, axis=1)
# df['tfidf_dist'] = df.apply(tfidf_dis, axis=1)
df['jaccard_dist'] = df.apply(jaccard, axis=1)
df['len_diff'] = df.apply(len_diff, axis=1)
df['word_share'] = df.apply(word_match_share, axis=1)

In [46]:
df.to_csv('clean.csv')