In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize

from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split

import sys
sys.path.insert(0, '../common/')
import csv_utils

import os
import spacy

In [2]:
path = os.path.join('..','data','train_data_v2.csv')
train_data = pd.read_csv(path)

path = os.path.join('..','data','test_data.csv')
test_data = pd.read_csv(path)

#### Checking how many NaNs there are in the dataset

In [3]:
# tokenize questions
train_data.question1 = train_data.question1.apply(word_tokenize)
train_data.question2 = train_data.question2.apply(word_tokenize)

test_data.question1 = test_data.question1.apply(word_tokenize)
test_data.question2 = test_data.question2.apply(word_tokenize)

In [4]:
stemmer=SnowballStemmer("english")
#example: stemmer.stem("having") -> have

In [5]:
# tokenize questions
steming = lambda tokens: [stemmer.stem(token) for token in tokens]
train_data.question1 = train_data.question1.apply(steming)
train_data.question2 = train_data.question2.apply(steming)
test_data.question1 = test_data.question1.apply(steming)
test_data.question2 = test_data.question2.apply(steming)

In [6]:
wh = ['what', 'where', 'what', 'why', 'which', 'how']
def contain5W1H(tokens):
    return any([stemmer.stem(x) in tokens for x in wh])

def which5W1H(tokens):
    #print([stemmer.stem(x) for x in wh if stemmer.stem(x) in tokens][0])
    for x in wh:
        s = stemmer.stem(x) 
        if s in tokens:
            return s

def get_features(q1, q2):
    output = {}
    tokens1 = list(set(q1))
    tokens2 = list(set(q2))
    output['similar_words'] = sum([word in tokens2 for word in tokens1])
    #output['both_contain_5w-1h'] = contain5W1H(q1) and contain5W1H(q2)
    output['same_5w-1h'] = which5W1H(q1) == which5W1H(q2)
    output['absolute_difference_of_lenth'] = abs(len(q1) - len(q2))
    #output['length_q1'] = len(q1)
    #output['length_q2'] = len(q2)
    return output

In [7]:
# Example
q1 = train_data.question1.at[6]
q2 = train_data.question2.at[6]
print(q1)
print(q2)
get_features(q1, q2)

['should', 'i', 'buy', 'tiago', '?']
['what', 'keep', 'childern', 'activ', 'and', 'far', 'from', 'phone', 'and', 'video', 'game', '?']


{'similar_words': 1, 'same_5w-1h': False, 'absolute_difference_of_lenth': 7}

In [8]:
featuresets = [(get_features(row.question1, row.question2), row.is_duplicate) for (index, row) in train_data.iterrows()]
train, test = train_test_split(featuresets, test_size=0.33, random_state=42)

In [9]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [10]:
print(nltk.classify.accuracy(classifier, test))

0.6507225040555873


In [11]:
classifier.show_most_informative_features(5)

Most Informative Features
           similar_words = 1                   0 : 1      =   1805.6 : 1.0
absolute_difference_of_lenth = 28                  0 : 1      =     39.9 : 1.0
absolute_difference_of_lenth = 31                  0 : 1      =     35.3 : 1.0
absolute_difference_of_lenth = 30                  0 : 1      =     31.9 : 1.0
absolute_difference_of_lenth = 32                  0 : 1      =     21.9 : 1.0


In [12]:
real_classifier = nltk.NaiveBayesClassifier.train(featuresets)

In [13]:
test_featureset = [get_features(row.question1, row.question2) for (index, row) in test_data.iterrows()]

In [14]:
predicted = [real_classifier.classify(t) for t in test_featureset]

In [15]:
csv_utils.create_csvs(predicted, test_data.test_id.values)

saved in:  /home/zenbook/Work/github/quora_npl/models/../data/submissions/submission_0603PM-November-23-2018.csv
