In [140]:
import pandas as pd
import nltk
from nltk import word_tokenize

from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split

import os

In [141]:
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

#### Checking how many NaNs there are in the dataset

In [143]:
len(train_data) - train_data.count()

id              0
question1       1
question2       3
is_duplicate    0
dtype: int64

In [144]:
train_data.dropna(inplace=True)

In [146]:
# tokenize questions
train_data.question1 = train_data.question1.apply(word_tokenize)
train_data.question2 = train_data.question2.apply(word_tokenize)

In [148]:
stemmer=SnowballStemmer("english")
#example: stemmer.stem("having") -> have

In [149]:
# tokenize questions
steeming = lambda tokens: [stemmer.stem(token) for token in tokens]
train_data.question1 = train_data.question1.apply(steeming)
train_data.question2 = train_data.question2.apply(steeming)

In [150]:
wh = ['what', 'where', 'what', 'why', 'which', 'how']
def contain5W1H(tokens):
    return any([stemmer.stem(x) in tokens for x in wh])

def which5W1H(tokens):
    #print([stemmer.stem(x) for x in wh if stemmer.stem(x) in tokens][0])
    for x in wh:
        s = stemmer.stem(x) 
        if s in tokens:
            return s

def get_features(q1, q2):
    output = {}
    tokens1 = list(set(q1))
    tokens2 = list(set(q2))
    output['similar_words'] = sum([word in tokens2 for word in tokens1])
    output['both_contain_5w-1h'] = contain5W1H(q1) and contain5W1H(q2)
    output['same_5w-1h'] = which5W1H(q1) == which5W1H(q2)
    output['length_q1'] = len(q1)
    output['length_q2'] = len(q2)
    return output

In [151]:
# Example
q1 = train_data.question1.at[6]
q2 = train_data.question2.at[6]
print(q1)
print(q2)
get_features(q1, q2)

['should', 'i', 'buy', 'tiago', '?']
['what', 'keep', 'childern', 'activ', 'and', 'far', 'from', 'phone', 'and', 'video', 'game', '?']


{'similar_words': 1,
 'both_contain_5w-1h': False,
 'same_5w-1h': False,
 'length_q1': 5,
 'length_q2': 12}

In [152]:
featuresets = [(get_features(row.question1, row.question2), row.is_duplicate) for (index, row) in train_data.iterrows()]
train, test = train_test_split(featuresets, test_size=0.33, random_state=42)

In [153]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [154]:
print(nltk.classify.accuracy(classifier, test))

0.6596588618099641
