In [524]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
import datetime, time, json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint, History
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [3]:
# Load the vocabulary entries; element 0 of each entry is used as the token below.
wordlist = np.load('/Users/zhang/MscProject_tweak2vec/corpus/quora_vocab5.npy').tolist()
# Append an 'UNK' placeholder entry as the last vocabulary row.
wordlist.append(['UNK',0])
# Build both lookup tables in a single linear pass.  Calling wordlist.index(w)
# for every entry rescans the list each time, which is quadratic in the
# vocabulary size.
# NOTE(review): assumes every token occurs once in the vocabulary file — confirm.
word2idx = {entry[0]: position for position, entry in enumerate(wordlist)}
idx2word = {position: entry[0] for position, entry in enumerate(wordlist)}

In [284]:
# Load the three embedding matrices compared in this notebook, in the order
# (embed_reg, embed_bas, embed_pre).
embed_reg, embed_bas, embed_pre = (
    np.load(npy_path)
    for npy_path in (
        '/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_pivotsfull_alpha10_5m.npy',
        '/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_quora_5m.npy',
        '/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_quoragoogle_50d.npy',
    )
)

### search nearest neighbours

In [5]:
def K_neighbour(embed_set, word, k):
    """Return the `k` vocabulary words nearest to `word` in `embed_set`.

    A KDTree is built over all rows of `embed_set` on every call and queried
    with the row at position ``word2idx[word]``.  The query vector is itself
    one of the indexed rows, so it is a candidate in the result.

    Parameters
    ----------
    embed_set : 2-D array, one embedding vector per row.
    word : str, must be a key of the module-level ``word2idx``.
    k : int, number of neighbours to return.

    Returns
    -------
    list of str
        Words looked up through the module-level ``idx2word``, in the order
        returned by the tree query.
    """
    index_tree = KDTree(embed_set, leaf_size=100)
    query_vec = embed_set[word2idx[word]]
    # Distances are not needed, only the row indices of the neighbours.
    _, neighbour_rows = index_tree.query([query_vec], k)
    return [idx2word[row] for row in neighbour_rows.reshape(k,)]

In [230]:
w = 'history'
# NOTE(review): embed_seg is not assigned anywhere in the visible notebook (the
# load cell defines embed_reg, embed_bas and embed_pre) — confirm it exists
# before running this cell on a fresh kernel.
near_words_seg, near_words_bas, near_words_pre = (
    K_neighbour(embedding, w, 6)
    for embedding in (embed_seg, embed_bas, embed_pre)
)
# Print the 6 nearest neighbours of `w` under each embedding, one line per embedding.
for tag, neighbours in (('SEG-----', near_words_seg),
                        ('BAS-----', near_words_bas),
                        ('PRE-----', near_words_pre)):
    print(tag, neighbours)

SEG----- ['history', 'today', 'part', 'most', 'significant', 'achievements']
BAS----- ['history', 'events', 'modern', 'relevant', 'recent', 'involved']
PRE----- ['history', 'most', 'century', 'one', 'decade', 'decades']


### compute cosine similarity

### get test prediction

In [7]:
def Max_BoE(word_embedding):
    """Build and compile the max-pooled bag-of-embeddings question-pair classifier.

    Each of the two inputs is a sequence of ``max_sentence_len`` word indices.
    Every question goes through its own frozen Embedding layer initialised
    from `word_embedding`, a per-token Dense(relu) projection, and a max over
    the time axis.  The two pooled vectors are concatenated and passed through
    four Dense(200, relu) -> Dropout -> BatchNormalization stages, followed by
    a single sigmoid unit.

    Parameters
    ----------
    word_embedding : 2-D array of shape (vocab_size, embed_dim)
        Embedding matrix used as the fixed weights of both Embedding layers.
        The vocabulary size and embedding width are taken from its shape
        rather than from module-level globals, so the layers always match the
        matrix that is passed in.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric.

    Raises
    ------
    ValueError
        If `word_embedding` is not 2-dimensional.

    Notes
    -----
    Still reads the module-level ``max_sentence_len`` and ``dropout_rate``.
    Layers are created in the same order as before (question1 branch,
    question2 branch, then the dense stack).
    """
    # Derive sizes from the matrix itself; a stale or wrong global vocab_size
    # would make the Embedding layer disagree with the supplied weights.
    vocab_size, embed_dim = np.shape(word_embedding)

    def encode_question(question):
        # Frozen embedding lookup -> per-token projection -> max over time steps.
        embedded = Embedding(input_dim=vocab_size,
                             output_dim=embed_dim,
                             weights=[word_embedding],
                             input_length=max_sentence_len,
                             trainable=False)(question)
        projected = TimeDistributed(Dense(embed_dim, activation='relu'))(embedded)
        return Lambda(lambda x: K.max(x, axis=1), output_shape=(embed_dim, ))(projected)

    question1 = Input(shape=(max_sentence_len,))
    question2 = Input(shape=(max_sentence_len,))

    q1 = encode_question(question1)
    q2 = encode_question(question2)

    merged = concatenate([q1,q2])
    # Four identical Dense -> Dropout -> BatchNormalization stages.
    for _ in range(4):
        merged = Dense(200, activation='relu')(merged)
        merged = Dropout(dropout_rate)(merged)
        merged = BatchNormalization()(merged)

    is_duplicate = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[question1,question2], outputs=is_duplicate)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [35]:
# Embedding matrices keyed by the tag used for the matching weight file.
current_embed = {
    'REG': np.load("/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_cat_reg_5m.npy"),
    'BAS': np.load("/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_quora_5m.npy"),
    'PRE': np.load("/Users/zhang/MscProject_tweak2vec/word2vecModel/quora/tokens/w2v_quoragoogle_50d.npy"),
}

In [9]:
# Load the integer-encoded questions and the duplicate labels.
quora_corpus = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_corpus_int5.npy")
labels = np.load("/Users/zhang/MscProject_tweak2vec/corpus/quora_labels.npy")

# Consecutive corpus entries form one pair: entry 2n is question1 and entry
# 2n+1 is question2.  Integer division drops a trailing unpaired entry.
n_pairs = len(quora_corpus) // 2
question1 = [quora_corpus[2 * n] for n in range(n_pairs)]
question2 = [quora_corpus[2 * n + 1] for n in range(n_pairs)]

# hyperparameter setup
max_sentence_len = 25
embed_dim = 50
dropout_rate = 0.1
# current_embed is a dict of embedding matrices, so len(current_embed) is the
# number of embeddings (3), not the vocabulary size.  Use the row count of one
# matrix instead.
# NOTE(review): assumes all matrices in current_embed share the same vocabulary — confirm.
vocab_size = current_embed['BAS'].shape[0]

# Pad/truncate every question to the same length used by the model inputs.
q1_data = pad_sequences(question1, maxlen=max_sentence_len)
q2_data = pad_sequences(question2, maxlen=max_sentence_len)
# Stack to (n_pairs, 2, max_sentence_len) so both questions of a pair are split together.
questions = np.stack((q1_data, q2_data), axis=1)
X_train, X_test, y_train, y_test = train_test_split(questions, labels, test_size=0.1, random_state=2018)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [36]:
# Saved Max_BoE weight files, keyed by the same tags as the embedding matrices.
weight_file = {
    'BAS': '/Users/zhang/MscProject_tweak2vec/Max_BOE_weights/quora_5m_weights.h5',
    'REG': '/Users/zhang/MscProject_tweak2vec/Max_BOE_weights/reg_5m_weights.h5',
    'PRE': '/Users/zhang/MscProject_tweak2vec/Max_BOE_weights/pretrain_50d_weights.h5',
}

In [540]:
def return_acc(w_file, embed, n_samples=10000):
    """Return per-sample correctness flags of a saved Max_BoE model on the test split.

    Despite the name, this does not return a single accuracy figure: it
    returns one 0/1 flag per evaluated test pair.

    Parameters
    ----------
    w_file : str
        Path of the weight file loaded into a freshly built Max_BoE model.
    embed : 2-D array
        Embedding matrix passed to ``Max_BoE``.
    n_samples : int, optional
        Number of leading test pairs to score (default 10000, the value that
        was previously hard-coded).

    Returns
    -------
    numpy.ndarray of shape (n, 1)
        1 where the predicted probability is within 0.5 of the label, else 0.

    Notes
    -----
    Reads the module-level ``Q1_test``, ``Q2_test`` and ``y_test``.  The
    earlier ``model.evaluate`` call was removed because its loss and accuracy
    were never used, so it only added a second pass over the data.
    """
    model = Max_BoE(embed)
    model.load_weights(w_file)

    q1_batch = Q1_test[:n_samples]
    q2_batch = Q2_test[:n_samples]
    # predict() yields one probability per pair; flatten so it lines up with the labels.
    pred_y = model.predict([q1_batch, q2_batch], verbose=0).reshape(-1)

    abs_error = np.abs(pred_y - y_test[:n_samples])
    # A prediction counts as correct when it falls on the label's side of 0.5.
    return (abs_error < 0.5).astype(int).reshape(-1, 1)
    

In [541]:
# Per-sample correctness flags for each embedding, keyed by tag and computed
# in the order REG, BAS, PRE.
acc_list = {
    tag: return_acc(weight_file[tag], current_embed[tag])
    for tag in ('REG', 'BAS', 'PRE')
}
pred_list = {}

In [534]:
# NOTE(review): both arguments are y_test, so the labels are compared with
# themselves; the matrix is diagonal and only shows how many test pairs fall
# in class 0 and class 1.  Pass model predictions as the second argument to
# get an actual confusion matrix.
confusion_matrix(y_test, y_test, labels=[0,1])

array([[25495,     0],
       [    0, 14934]])

In [543]:
# One row per evaluated test pair; columns are the correctness flags in the
# order REG, BAS, PRE.
acc_cat = np.concatenate([acc_list[tag] for tag in ('REG', 'BAS', 'PRE')], axis=1)

In [651]:
# Correctness pattern to inspect, in acc_cat column order (REG, BAS, PRE):
# rows where only the first model was right.
wanted_pattern = [1, 0, 0]

# Select the matching rows first, so questions are decoded only for the rows
# that are printed instead of for every evaluated pair.
matching_rows = np.flatnonzero((acc_cat == wanted_pattern).all(axis=1))

for i in matching_rows:
    # Index 0 is rendered as an empty string (it is the padding value added by
    # pad_sequences), which is why the printed questions start with blanks.
    q1 = ' '.join(idx2word[w] if w != 0 else '' for w in Q1_test[i])
    q2 = ' '.join(idx2word[w] if w != 0 else '' for w in Q2_test[i])

    print('indix:{0},results:{1}'.format(i,acc_cat[i]))
    print('question1:',q1)
    print('question2:',q2)


indix:85,results:[1 0 0]
question1:              who are top fea service providers provide accurate handling analysis services globally
question2:              who are top fea service providers provide accurate modal analysis services globally
indix:154,results:[1 0 0]
question1:                      why reddit better quora
question2:                      site better reddit quora
indix:185,results:[1 0 0]
question1:               why do we have stage fright how do we overcome it
question2:                     best way overcome stage fear
indix:214,results:[1 0 0]
question1:                    my dad cheats should  do
question2:               recently found my dad cheating on my mom should  do
indix:226,results:[1 0 0]
question1:                     best way advertise your business
question2:                     best way advertise business offline
indix:255,results:[1 0 0]
question1:                    cooler better violin piano your opinion
question2:            in your opinion better 

question1:                     are best mechanical engineering books
question2:                  are some interesting books mechanical engineer should read
indix:3173,results:[1 0 0]
question1:                      autocad still new outdated
question2:                        autocad outdated
indix:3193,results:[1 0 0]
question1:                      ethical hacking certification free
question2:                     are certification courses ethical hacking
indix:3276,results:[1 0 0]
question1:                   how do you define success in life
question2:                     how does one define success
indix:3277,results:[1 0 0]
question1:                    best way avoid procrastinating on internet
question2:                     how do  avoid procrastinating
indix:3280,results:[1 0 0]
question1:                      importance mitochondria in cell
question2:                      mitochondria are main functions
indix:3300,results:[1 0 0]
question1:                 why we do study compu

question2:                 are 7th chords more commonly played major 7th chords
indix:9509,results:[1 0 0]
question1:             can  make 30 million dollars day trading  start with 500 000
question2:            can  make 30 million dollars day trading  start with 500 000 how
indix:9530,results:[1 0 0]
question1:                    are facials good bad your skin
question2:                   are cosmetic facials actually good your skin
indix:9700,results:[1 0 0]
question1:                    are renewable resources are some examples
question2:                     are examples nonrenewable renewable resources
indix:9720,results:[1 0 0]
question1:               would happen you hired two private detectives spy on each other
question2:                will happen  hire two private detectives follow each other
indix:9805,results:[1 0 0]
question1:                  are best resources learn programming at its core
question2:                     best resource learn c programming
indix:9861,res

In [223]:
# Raw question pairs, used below to look up the original (untokenised) text.
train_pairs = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/QuoraQuestionPairs/train.csv',
    encoding='ISO-8859-1',
)
# Widen text columns so long questions are not cut off when displayed.
pd.set_option('display.max_colwidth', 200)

In [617]:
# 0/1 flags for rows whose question1 contains each substring.  na=False makes
# missing question text count as "no match"; without it str.contains returns
# NaN for those rows, which np.where treats as truthy and flags as a match.
where1 = np.where(train_pairs['question1'].str.contains('sugar', na=False), 1, 0)
where2 = np.where(train_pairs['question1'].str.contains('you', na=False), 1, 0)
# Positions where both substrings occur, computed without a Python-level row loop.
ids = np.flatnonzero(where1 & where2).tolist()
train_pairs.loc[ids]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
316,316,632,633,"What does this saying mean; ""Don't trust everything you see even salt looks like sugar""?","People say ""don't try to please others."" Does being nice to others mean pleasing them?",0
8936,8936,17392,17393,Can you substitue brown sugar with white sugar?,Can I substitute white sugar for brown?,1
46063,46063,82437,82438,What would happen if you cut out refined sugar entirely?,How can I get accutane in London and how expensive is it when using a private doctor?,0
49806,49806,88602,88603,Does consuming sugar lower your immune system?,How does consuming too much sugar on a daily basis weaken your immune system?,1
50075,50075,89025,89026,How can mouthwash be sweet? Isn't sugar bad for your teeth?,Is spaghetti bad for teeth?,0
57474,57474,100990,100991,Does eating too much sugar give you diabetes? I actually do work out and eat healthy for the most part but I'm wondering because I eat a lot of sugar,Does eating too much sugar actually give you an increased risk for diabetes?,1
88182,88182,148364,148365,Why is sugar bad for you?,Why sugar is bad for health?,1
99279,99279,88602,164840,Does consuming sugar lower your immune system?,Does sugar suppress the effectiveness of the immune system?,1
106283,106283,175111,175112,How can you find a sugar daddy?,How do I acquire a sugar daddy?,1
200736,200736,148364,292983,Why is sugar bad for you?,Why is sugar bad for us?,1


In [650]:
# Words to compare, and their embed_reg vectors reshaped to (n, 1) columns.
w1, w2 = 'obama', 'putin'
vec1, vec2 = (embed_reg[word2idx[word]].reshape(-1,1) for word in (w1, w2))
def cosin_distance(vector1, vector2):
    """Return the absolute cosine similarity of two vectors.

    Despite the name, the value is |cos(angle)| (a similarity), not a
    distance.  The vectors are walked in lockstep, so iteration stops at the
    end of the shorter one.

    Parameters
    ----------
    vector1, vector2 : iterables of numbers (or of one-element arrays, as
        produced by iterating an (n, 1) column vector).

    Returns
    -------
    The absolute value of dot(v1, v2) / sqrt(|v1|^2 * |v2|^2), or None when
    either vector has zero squared norm (including empty input).
    """
    dot = 0.0
    sq_norm1 = 0.0
    sq_norm2 = 0.0
    for x, y in zip(vector1, vector2):
        dot = dot + x * y
        sq_norm1 = sq_norm1 + x ** 2
        sq_norm2 = sq_norm2 + y ** 2

    # Guard: the similarity is undefined for a zero vector.
    if sq_norm1 == 0.0 or sq_norm2 == 0.0:
        return None

    denominator = (sq_norm1 * sq_norm2) ** 0.5
    return np.abs(dot / denominator)
# Absolute cosine similarity of the two word vectors.  vec1 and vec2 are
# (n, 1) columns, so the result is displayed as a one-element array.
cosin_distance(vec1, vec2)

array([0.72424675])