In [1]:
import string
import types
import pandas as pd
import nltk
import numpy as np
from numpy.random import randn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.metrics import classification_report
import time
from sklearn import svm
import codecs, csv, sys
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB                                                                
from sklearn.cross_validation import train_test_split 
from sklearn.metrics import accuracy_score
import itertools
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV



In [2]:
# Load the train and test tables. Later cells read the columns
# 'embedding_1', 'embedding_2' and 'is_duplicate' from both frames.
train_csv_path = "/Volumes/Barly/NLP/Project/5k_testing/Type3/data/doc2Vec/train_doc2vec.csv"
test_csv_path = "/Volumes/Barly/NLP/Project/5k_testing/Type3/data/doc2Vec/test_doc2vec.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [3]:
start_time = time.time()

# Number of float values read from every serialized embedding cell.
EMBEDDING_DIM = 100


def _parse_embeddings(column, dim=EMBEDDING_DIM):
    """Decode every cell of `column` into a 1-D float array of `dim` values.

    Returns a real list (not a lazy `map` object) so that `np.array(...)`
    stacks it into a 2-D matrix on both Python 2 and Python 3.

    NOTE(review): `np.fromstring` is called without `sep`, which is its
    binary mode -- the raw bytes of the string are reinterpreted as floats
    rather than parsed as decimal text. If the CSV cells hold textual
    numbers (e.g. "0.12 -0.53 ..."), a `sep=' '` argument is required to get
    the real embedding values. The CSV format is not visible here, so the
    original call is kept unchanged -- TODO confirm against the writer.
    """
    return [np.fromstring(cell, float, dim) for cell in column]


# NOTE: the variable names below are kept because later cells use them, but
# they are easy to misread:
#   X_train / X_label -> training features / training labels
#   Y_train / Y_label -> TEST features     / TEST labels

# --- training set ---------------------------------------------------------
train_embedding1 = _parse_embeddings(df_train['embedding_1'])
train_embedding2 = _parse_embeddings(df_train['embedding_2'])

X_label = np.array(df_train['is_duplicate'].tolist())
# One row per pair: [embedding_1 | embedding_2] side by side.
X_train = np.concatenate((np.array(train_embedding1), np.array(train_embedding2)), axis=1)
# The normalizer is fitted on the training features and reused for the test set.
normalize = Normalizer().fit(X_train)
X_train = normalize.transform(X_train)

# --- test set -------------------------------------------------------------
test_embedding1 = _parse_embeddings(df_test['embedding_1'])
test_embedding2 = _parse_embeddings(df_test['embedding_2'])

Y_label = np.array(df_test['is_duplicate'].tolist())
Y_train = np.concatenate((np.array(test_embedding1), np.array(test_embedding2)), axis=1)
Y_train = normalize.transform(Y_train)

In [4]:
def naive_bayes(x_train_matrix, x_test_matrix, y_train, y_test):
    """Fit a Multinomial Naive Bayes model and report its test-set metrics.

    Parameters
    ----------
    x_train_matrix : feature matrix the model is fitted on.
    x_test_matrix  : feature matrix the model predicts for.
    y_train        : labels matching `x_train_matrix`.
    y_test         : ground-truth labels matching `x_test_matrix`.

    Prints the accuracy, the confusion matrix and the log loss.

    Returns
    -------
    The predicted labels for `x_test_matrix`.

    NOTE(review): MultinomialNB is documented for non-negative (count-like)
    features; dense embedding vectors normally contain negative values --
    confirm this model is appropriate for the inputs.
    """
    nb = MultinomialNB(alpha=2.8)
    nb.fit(x_train_matrix, y_train)                                 # train the model
    y_pred = nb.predict(x_test_matrix)                              # hard predictions for the test set
    print ("Naive Bayes Model score: " + str(accuracy_score(y_test, y_pred)))
    print ("Naive Bayes Model confusion matrix:")
    print (confusion_matrix(y_test, y_pred))
    y_predicted_proba = nb.predict_proba(x_test_matrix)             # class probabilities for log loss
    # Parenthesised single-argument print: valid on Python 2 and Python 3.
    print (log_loss(y_test, y_predicted_proba))
    return y_pred

def logistic_regression(x_train_matrix, x_test_matrix, y_train, y_test):
    """Fit a cross-validated logistic regression and report test-set metrics.

    The regularisation strength is chosen by `LogisticRegressionCV` from
    Cs = [0.01, 0.1, 1, 10, 100] using 5-fold cross-validation with the
    'saga' solver on all available cores (n_jobs=-1).

    Parameters
    ----------
    x_train_matrix : feature matrix the model is fitted on.
    x_test_matrix  : feature matrix the model predicts for.
    y_train        : labels matching `x_train_matrix`.
    y_test         : ground-truth labels matching `x_test_matrix`.

    Prints the accuracy, the confusion matrix and the log loss.

    Returns
    -------
    The predicted labels for `x_test_matrix`.
    """
    logReg = LogisticRegressionCV(Cs=[0.01, 0.1, 1, 10, 100], cv=5, solver='saga', n_jobs=-1)
    logReg.fit(x_train_matrix, y_train)
    y_pred_log = logReg.predict(x_test_matrix)
    print ("Logistic Regression score: "+ str(accuracy_score(y_test, y_pred_log)))
    print ("Logistic Regression confusion matrix:")
    print (confusion_matrix(y_test, y_pred_log))
    y_predicted_proba = logReg.predict_proba(x_test_matrix)         # class probabilities for log loss
    # Parenthesised single-argument print: valid on Python 2 and Python 3.
    print (log_loss(y_test, y_predicted_proba))
    return y_pred_log

def linear_svm(x_train_matrix, x_test_matrix, y_train, y_test):
    """Grid-search a linear-kernel SVC and report its test-set metrics.

    `C` is searched over [1, 10, 100, 1000] with 5-fold cross-validation on
    all available cores (n_jobs=-1). Predictions are made with the fitted
    search object.

    Parameters
    ----------
    x_train_matrix : feature matrix the search is fitted on.
    x_test_matrix  : feature matrix to predict for.
    y_train        : labels matching `x_train_matrix`.
    y_test         : ground-truth labels matching `x_test_matrix`.

    Prints the accuracy and the confusion matrix.

    Returns
    -------
    The predicted labels for `x_test_matrix`.
    """
    candidate_params = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}]
    search = GridSearchCV(
        estimator=svm.SVC(),
        param_grid=candidate_params,
        n_jobs=-1,
        cv=5
    )
    search.fit(x_train_matrix, y_train)

    predictions = search.predict(x_test_matrix)
    accuracy = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    print ("SVC score: " + str(accuracy))
    print ("SVC confusionmatrix:")
    print (matrix)
    return predictions

In [5]:
# Driver cell: run the three classifiers one after another on the prepared
# matrices and print the wall-clock time spent in each stage.
# Argument order for every classifier: (train features, test features,
# train labels, test labels) == (X_train, Y_train, X_label, Y_label).
section_rule = "-------------------------------------------"

# Stage 0 -- everything since `start_time` counts as preprocessing.
preprocessing_time = time.time()
preprocessing_elapsed = preprocessing_time - start_time
print("--- start -> preprocessing:%s seconds ---" % preprocessing_elapsed)

# Stage 1 -- Naive Bayes.
print ("\n\n Naive bayes result on doc2vec_embedding: ")
y_pred_nb = naive_bayes(X_train, Y_train, X_label, Y_label)
naive_bayes_time = time.time()
naive_bayes_elapsed = naive_bayes_time - preprocessing_time
print("--- preprocessing -> naive bayes :%s seconds ---" % naive_bayes_elapsed)

# Stage 2 -- Logistic Regression.
print (section_rule)
print ("\n\n Logistic Regression result on doc2vec_embedding: ")
y_pred_lr = logistic_regression(X_train, Y_train, X_label, Y_label)
logistic_regression_time = time.time()
logistic_regression_elapsed = logistic_regression_time - naive_bayes_time
print("--- naive bayes -> logistic regression :%s seconds ---" % logistic_regression_elapsed)

# Stage 3 -- Linear SVM.
print (section_rule)
print ("\n\n Linear SVM result on doc2vec_embedding: ")
y_pred_svm = linear_svm(X_train, Y_train, X_label, Y_label)
linear_svm_time = time.time()
linear_svm_elapsed = linear_svm_time - logistic_regression_time
print("--- logistic regression -> linear svm :%s seconds ---" % linear_svm_elapsed)

# Overall wall-clock time, measured at the moment of printing.
print("--- TOTAL TIME: %s seconds ---" % (time.time() - start_time))

--- start -> preprocessing:0.538676977158 seconds ---


 Naive bayes result on doc2vec_embedding: 
Naive Bayes Model score: 0.484
Naive Bayes Model confusion matrix:
[[160  90]
 [168  82]]
0.696229682641
--- preprocessing -> naive bayes :0.0167608261108 seconds ---
-------------------------------------------


 Logistic Regression result on doc2vec_embedding: 




Logistic Regression score: 0.478
Logistic Regression confusion matrix:
[[156  94]
 [167  83]]
0.71349812543
--- naive bayes -> logistic regression :39.7936460972 seconds ---
-------------------------------------------


 Linear SVM result on doc2vec_embedding: 
SVC score: 0.494
SVC confusionmatrix:
[[161  89]
 [164  86]]
--- logistic regression -> linear svm :870.411616087 seconds ---
--- TOTAL TIME: 910.761385918 seconds ---


In [None]:
# Convert each prediction vector to a plain Python list.
# Going through np.asarray keeps the cell re-runnable: on a second run the
# names already hold lists, which have no `.tolist()` method of their own.
y_pred_nb = np.asarray(y_pred_nb).tolist()
y_pred_lr = np.asarray(y_pred_lr).tolist()
y_pred_svm = np.asarray(y_pred_svm).tolist()

In [None]:
# Attach each classifier's predictions to the test frame as its own column,
# in the order Naive Bayes, Logistic Regression, SVM.
for column_name, predicted_labels in (
        ('nb_duplicate', y_pred_nb),
        ('lr_duplicate', y_pred_lr),
        ('svm_duplicate', y_pred_svm)):
    df_test[column_name] = predicted_labels

In [None]:
# Write the test frame, including the prediction columns, to disk without
# the DataFrame index.
results_csv_path = "/Volumes/Barly/NLP/Project/5k_testing/Type3/Results/test_doc2vec_classifier.csv"
df_test.to_csv(results_csv_path, index=False)