In [1]:
import string
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize, regexp_tokenize
from autocorrect import spell
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from itertools import chain
from gensim.models.doc2vec import TaggedDocument
import numpy as np
from numpy.random import randn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
import gensim
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB                                                                
from sklearn.cross_validation import train_test_split 
from sklearn.metrics import accuracy_score



In [2]:
# Load the training pairs from the working directory. Later cells read the
# columns id, qid1, qid2, question1, question2 and is_duplicate from this frame.
df_train = pd.read_csv("train.csv")

In [3]:
# Split the training frame into per-column Series: the two question texts,
# their question ids, the pair id and the duplicate label.
sentence_1a, sentence_1b = df_train['question1'], df_train['question2']
qid1, qid2 = df_train['qid1'], df_train['qid2']
id_ = df_train['id']
is_duplicate = df_train['is_duplicate']

In [4]:
# Shared text-preprocessing resources used by the helper functions below.
punctuation_list = list(string.punctuation)          # individual punctuation characters
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')    # tokens are runs of word characters
STOPWORDS = stopwords.words('english')

def spelling_correction(question):
    """Strip every non-alphanumeric character from the tokens of `question`.

    Despite its name, this function performs no spelling correction: it
    tokenizes the text with the module-level `tokenizer`, drops characters
    that are punctuation or not alphanumeric, and returns the cleaned tokens
    each followed by a single space (so a non-empty result ends in a space).
    """
    def _is_kept(ch):
        return ch.isalnum() and ch not in punctuation_list

    pieces = []
    for raw_token in tokenizer.tokenize(question):
        pieces.append(''.join(ch for ch in raw_token if _is_kept(ch)))
        pieces.append(" ")
    return ''.join(pieces)

def remove_stopword_questions(question):
    """Return `question` with English stopwords removed.

    The text is tokenized with the module-level `tokenizer`; tokens that
    appear in `STOPWORDS` are dropped and the remaining tokens are joined
    with single spaces. Matching is exact, so callers are expected to
    lower-case the text first.
    """
    # Build the lookup set once per call instead of asking NLTK for the
    # stopword list again for every single token and scanning it linearly.
    stopword_set = frozenset(STOPWORDS)
    kept_tokens = [token for token in tokenizer.tokenize(question) if token not in stopword_set]
    return ' '.join(kept_tokens)

def get_preprocessed_tokens(question):
    """Clean `question` and return it as a list of tokens.

    Steps, in order: strip non-alphanumeric characters, lower-case the text,
    remove English stopwords, then tokenize the result.
    """
    stripped = spelling_correction(question)
    without_stopwords = remove_stopword_questions(stripped.lower())
    return list(tokenizer.tokenize(without_stopwords))

In [5]:
def lemmatize_tokens(question):
    """Return the preprocessed tokens of `question`, each lemmatized.

    Tokens come from `get_preprocessed_tokens` and are passed one by one
    through the module-level WordNet lemmatizer; order is preserved.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in get_preprocessed_tokens(question)]

In [6]:
# Build the Doc2Vec training corpus and a table of cleaned question pairs.
taggeddocs = []          # one TaggedDocument per question occurrence
tag2questionmap = {}     # tag (u'SENT_<qid>') -> cleaned question text
cleaned_rows = []        # one record per pair that was cleaned successfully
for c_id, id1, id2, q1, q2, label in zip(id_, qid1, qid2, sentence_1a, sentence_1b, is_duplicate):
    try:
        cleaned_questions = [" ".join(lemmatize_tokens(question)) for question in (q1, q2)]
        # Collect plain records and build the frame once after the loop;
        # concatenating a one-row frame per iteration copies the whole frame
        # every time and leaves every row with index label 0.
        cleaned_rows.append({'index': c_id,
                             'cleaned_question1': cleaned_questions[0],
                             'cleaned_question2': cleaned_questions[1],
                             'is_duplicate': label,
                             'question1': q1,
                             'question2': q2})
        for qid, cleaned_question in zip((id1, id2), cleaned_questions):
            tag = u'SENT_{:d}'.format(qid)
            sentence = TaggedDocument(words=gensim.utils.to_unicode(cleaned_question).split(), tags=[tag])
            tag2questionmap[tag] = cleaned_question
            taggeddocs.append(sentence)
    except Exception:
        # Best effort: a pair that cannot be processed is skipped and its id
        # is printed. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop this long-running loop.
        print(c_id)
df_doc2vec = pd.DataFrame(cleaned_rows)
    
    

# Earlier configuration, kept for reference:
#model = gensim.models.Doc2Vec(taggeddocs, dm=0, alpha=0.025, size=100, min_alpha=0.025, min_count=2)
# Create an untrained Doc2Vec model, build its vocabulary from the tagged
# questions, then train for 100 epochs over the same corpus.
# NOTE(review): no seed is passed here, so results are presumably not
# reproducible run to run -- confirm whether that matters.
model = gensim.models.doc2vec.Doc2Vec(size=100, min_count=2, alpha=0.1)
model.build_vocab(taggeddocs)
# %time is an IPython magic: it reports CPU and wall time for the training call.
%time model.train(taggeddocs, total_examples=model.corpus_count, epochs=100)


105780
201841
CPU times: user 47min 52s, sys: 25min 32s, total: 1h 13min 25s
Wall time: 48min 25s


510982312

In [137]:


cleaned_question1 = df_doc2vec['cleaned_question1']
cleaned_question2 = df_doc2vec['cleaned_question2']
duplicated_tag = df_doc2vec['is_duplicate']
# NOTE(review): this is an alias, not a copy -- the columns added at the end
# of this cell also appear on df_doc2vec_org. Use .copy() if a snapshot of the
# frame before the embedding columns was intended.
df_doc2vec_org = df_doc2vec

# Infer one vector per cleaned question (question 1 first, then question 2)
# and record the cosine similarity of each pair.
embedding_1 = []
embedding_2 = []
cos = []
for q1, q2 in zip(cleaned_question1, cleaned_question2):
    vector_1 = model.infer_vector(tokenizer.tokenize(q1))
    vector_2 = model.infer_vector(tokenizer.tokenize(q2))
    embedding_1.append(vector_1)
    embedding_2.append(vector_2)
    # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here;
    # take the single scalar so the 'cos' column holds plain numbers rather
    # than one-element arrays.
    pair_similarity = cosine_similarity(vector_1.reshape(1, -1), vector_2.reshape(1, -1))
    cos.append(pair_similarity[0][0])

df_doc2vec['embedding_1'] = embedding_1
df_doc2vec['embedding_2'] = embedding_2
df_doc2vec['cos'] = cos

In [138]:
# Persist the cleaned questions, labels, embeddings and cosine scores.
# NOTE(review): the embedding columns hold array objects, which a CSV can only
# store as text -- confirm that is acceptable for whoever reads this file back.
df_doc2vec.to_csv("doc2vec_embeddings_without_stopwords.csv", index=False)

In [12]:
# Evaluate a plain cosine-similarity cut-off as a duplicate classifier:
# a pair is predicted duplicate (1) when its similarity exceeds the threshold.
threshold_list = [0.5 , 0.6 , 0.7, 0.8, 0.9]
cosine = df_doc2vec['cos']
y_true = duplicated_tag
target_names = [0, 1]   # fixed label order so ravel() yields tn, fp, fn, tp
for threshold in threshold_list:
    # The loop variable is deliberately not called `cos`, so the module-level
    # `cos` list built by the embedding cell is not overwritten.
    is_dup_pred = [1 if similarity > threshold else 0 for similarity in cosine]

    # Keep the predictions for this threshold as an extra column.
    df_doc2vec['is_dup_pred'+ str(threshold)] = is_dup_pred
    y_pred = is_dup_pred
    # Compute the confusion matrix once and reuse it for printing and accuracy.
    matrix = confusion_matrix(y_true, y_pred, labels=target_names)
    tn, fp, fn, tp = matrix.ravel()
    print(matrix)
    accuracy = float(tn + tp)/(tn + fp + fn + tp)
    # Single-argument print(...) behaves the same as a print statement and as
    # the print function, matching the call style used in later cells.
    print("Accuracy:  " + str(accuracy))
    print(classification_report(y_true, y_pred))

[[196725  58300]
 [ 84406  64857]]
Accuracy:  0.647018956783
             precision    recall  f1-score   support

          0       0.70      0.77      0.73    255025
          1       0.53      0.43      0.48    149263

avg / total       0.64      0.65      0.64    404288

[[226296  28729]
 [115133  34130]]
Accuracy:  0.644159608992
             precision    recall  f1-score   support

          0       0.66      0.89      0.76    255025
          1       0.54      0.23      0.32    149263

avg / total       0.62      0.64      0.60    404288

[[246387   8638]
 [138971  10292]]
Accuracy:  0.634891463511
             precision    recall  f1-score   support

          0       0.64      0.97      0.77    255025
          1       0.54      0.07      0.12    149263

avg / total       0.60      0.63      0.53    404288

[[254007   1018]
 [148125   1138]]
Accuracy:  0.631097138673
             precision    recall  f1-score   support

          0       0.63      1.00      0.77    255025
    

In [141]:
# Classifier inputs: one feature per pair, obtained by adding the two
# per-question embedding columns with `+`; the label is the duplicate flag.
train_label = duplicated_tag
embedding1, embedding2 = df_doc2vec['embedding_1'], df_doc2vec['embedding_2']
train_questions = embedding1 + embedding2

In [142]:
def naive_bayer(x_train_matrix, x_test_matrix, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and print its test results.

    The model is trained on (x_train_matrix, y_train); predictions for
    x_test_matrix are compared with y_test, and the accuracy followed by
    the confusion matrix is printed. Nothing is returned.
    """
    classifier = GaussianNB().fit(x_train_matrix, y_train)
    predictions = classifier.predict(x_test_matrix)
    score = accuracy_score(y_test, predictions)
    print("Naive Byers Model score: " + str(score))
    print("Naive Byers Model confusion matrix:")
    print(confusion_matrix(y_test, predictions))

def logistic_regression(x_train_matrix, x_test_matrix, y_train, y_test):
    """Fit a logistic regression (C=0.85) and print its test results.

    The model is trained on (x_train_matrix, y_train); predictions for
    x_test_matrix are compared with y_test, and the accuracy followed by
    the confusion matrix is printed. Nothing is returned.
    """
    classifier = LogisticRegression(C=0.85).fit(x_train_matrix, y_train)
    predictions = classifier.predict(x_test_matrix)
    score = accuracy_score(y_test, predictions)
    print("Logistic Regression score: " + str(score))
    print("Logistic Regression confusion matrix:")
    print(confusion_matrix(y_test, predictions))

def linear_svm(x_train_matrix, x_test_matrix, y_train, y_test):
    """Fit a linear SVM (C=1) and print its test results.

    The model is trained on (x_train_matrix, y_train); predictions for
    x_test_matrix are compared with y_test, and the accuracy followed by
    the confusion matrix is printed. Nothing is returned.
    """
    classifier = LinearSVC(C=1).fit(x_train_matrix, y_train)
    predictions = classifier.predict(x_test_matrix)
    score = accuracy_score(y_test, predictions)
    print("SVC score: " + str(score))
    print("SVC confusionmatrix:")
    print(confusion_matrix(y_test, predictions))

In [143]:
# Train and evaluate the three classifiers on the per-pair features, printing
# the elapsed time between consecutive checkpoints.
start_time = time.time()


# Pair each feature vector with its label and split into train / test sets
# (fixed random_state so the split is repeatable).
train_question_label = zip(train_questions,train_label) 
train_df = pd.DataFrame(train_question_label, columns=['embedding', 'label'])
X = train_df.embedding
Y = train_df.label

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=1)

preprocessing_time = time.time()
print("--- start -> preprocessing:%s seconds ---" % (preprocessing_time - start_time))

# Turn the Series of per-row vectors into 2-D arrays for scikit-learn.
# This conversion happens after the preprocessing checkpoint, so its cost is
# counted in the naive Bayes interval below.
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
naive_bayer(X_train, X_test, Y_train, Y_test)
naive_bayer_time = time.time() 
print("--- preprocessing -> naive bayer :%s seconds ---" % (naive_bayer_time - preprocessing_time))

print ("-------------------------------------------")

logistic_regression(X_train, X_test, Y_train, Y_test)
logistic_regression_time = time.time()
print("--- niave bayer -> logistic regression :%s seconds ---" % (logistic_regression_time - naive_bayer_time))

print ("-------------------------------------------")

linear_svm(X_train, X_test, Y_train, Y_test)
linear_svm_time = time.time()
print("--- logistic regression -> linear svm :%s seconds ---" % (linear_svm_time - logistic_regression_time))
print("--- TOTAL TIME: %s seconds ---" % (time.time() - start_time))

--- start -> preprocessing:0.319641113281 seconds ---
Naive Byers Model score: 0.561480924489
Naive Byers Model confusion matrix:
[[31824 31940]
 [12382 24926]]
--- preprocessing -> naive bayer :1.08264183998 seconds ---
-------------------------------------------
Logistic Regression score: 0.634824679436
Logistic Regression confusion matrix:
[[59076  4688]
 [32221  5087]]
--- niave bayer -> logistic regression :3.42052698135 seconds ---
-------------------------------------------
SVC score: 0.63494340668
SVC confusionmatrix:
[[59574  4190]
 [32707  4601]]
--- logistic regression -> linear svm :176.277682066 seconds ---
--- TOTAL TIME: 181.100629091 seconds ---
