In [412]:
%load_ext autoreload
%autoreload 2

import dill
import numpy as np
import os
import pandas as pd
import pickle
import scipy.spatial.distance


import torch
from torch.autograd import Variable

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [202]:
# Load the pickled lookup tables; the `with` blocks close each file handle as soon
# as it has been read instead of leaving it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# data/elec_ind_to_sents.pkl -- confirm the directory these files live in.
with open(os.path.join("data", "elec_ind_to_sents.pkl"), 'rb') as sents_file:
    idx_to_sents = pickle.load(sents_file)
with open(os.path.join("data", "prod_to_ind.pkl"), 'rb') as prod_file:
    prod_to_idx = pickle.load(prod_file)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\elec_ind_to_sents.pkl'

In [204]:
# Read the paired review sentences (text1 / text2 columns) from the CSV file.
sentence_pairs = pd.read_csv(
    os.path.join("..", "src", "data", "paired_sentences.csv"),
    sep=",",
)

In [205]:
# Display the loaded frame to sanity-check the text1 / text2 columns.
sentence_pairs

Unnamed: 0,text1,text2
0,"works fine, as expected",Works great and cannot beat the price!
1,"works fine, as expected",Worked very good for what i needed and the pri...
2,"Works, great, shipped fast, great price.",This cable works perfectly and at a great price!
3,This cable is awesome and does the required jo...,"Excellent product, no complains, good price, w..."
4,So far it is working great.,So far this item is great.
5,Works like a charm.,Work very good
6,"This is a great SD card, it is very fast and w...","This is a class 10 microSD card, and it works ..."
7,No problems with it.,"Quick response, no lag or compatibility issues."
8,"Nothing really to say, except for excellent me...",Holds lots of data with fast read/write capabi...
9,"This is a class 10 microSD card, and it works ...","This is a great SD card, it is very fast and w..."


## Load Vocab

In [328]:
# Deserialize the source-side and target-side vocabularies, in that order.
src_vocab, trg_vocab = (
    torch.load(vocab_file) for vocab_file in ("src_vocab.pt", "trg_vocab.pt")
)

## Load Trad. Autoencoder

In [208]:
# Checkpoint of the traditional autoencoder, mapped onto the CPU.
model_path = "electronics_autoencoder_epoch7.pt"
# The model is only used for inference below, so switch it to eval mode right
# away: otherwise dropout can stay active and make the embeddings and the
# generated text vary between runs.
# NOTE(review): assumes the checkpoint is a pickled torch.nn.Module -- confirm.
autoencoder = torch.load(model_path, map_location="cpu").eval()

## Load Siamese Autoencoder

In [373]:
# Checkpoint of the siamese autoencoder, mapped onto the CPU.
siamese_model_path = "siamese_ae_epoch_1"
# The model is only used for inference below, so switch it to eval mode right
# away: otherwise dropout can stay active and make the embeddings and the
# generated text vary between runs.
# NOTE(review): assumes the checkpoint is a pickled torch.nn.Module -- confirm.
siamese_autoencoder = torch.load(siamese_model_path, map_location="cpu").eval()

## Embed & Generate Text

In [210]:
# Width of one row in the embedding matrices allocated further down.
EMBEDDING_DIM = 300
# One example per row of the sentence-pair frame.
NUM_EXAMPLES = sentence_pairs.shape[0]

In [335]:
# OpenNMT: Open-Source Toolkit for Neural Machine Translation
# https://github.com/harvardnlp/annotated-transformer
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode target indices, one position at a time.

    The source is encoded once and only the first slice along dim 0 of the
    encoder output is kept as decoder memory. Starting from ``start_symbol``,
    the highest-scoring index from ``model.generator`` is appended at every
    step. Decoding never stops early: exactly ``max_len - 1`` steps are run
    (none when ``max_len <= 1``).

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source index tensor; newly created tensors are cast to its type.
        src_mask: mask forwarded to ``model.encode`` and ``model.decode``.
        max_len: length of the decoded sequence, start symbol included.
        start_symbol: index placed in the first decoded position.

    Returns:
        Tuple ``(memory, ys)``: the sliced encoder output and the tensor of
        decoded indices (one row, one column per decoded position).
    """
    encoded = model.encode(src, src_mask)
    memory = encoded[0:1, :, :]

    decoded = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    steps_left = max_len - 1
    while steps_left > 0:
        # Each position may only attend to itself and earlier positions.
        target_mask = subsequent_mask(decoded.size(1)).type_as(src.data)
        out = model.decode(memory, src_mask, Variable(decoded), Variable(target_mask))
        # Score only the most recent position and keep the arg-max index.
        scores = model.generator(out[:, -1])
        best = torch.max(scores, dim=1)[1].data[0]
        appended = torch.ones(1, 1).type_as(src.data).fill_(best)
        decoded = torch.cat([decoded, appended], dim=1)
        steps_left -= 1
    return memory, decoded

def subsequent_mask(size):
    """Return a (1, size, size) mask that is true on and below the diagonal.

    Entry ``[0, i, j]`` is true exactly when ``j <= i``, so every later
    ("subsequent") position is masked out.
    """
    # Ones strictly above the diagonal mark the disallowed future positions ...
    future = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # ... so the allowed positions are exactly where that array is zero.
    allowed = torch.from_numpy(future) == 0
    return allowed

# src is a LongTensor of source-vocabulary indices, shaped (1, seq_len)
def generate_sentence(model, src):
    """Embed ``src`` with ``model`` and greedily decode it back to text.

    Decoding uses an all-ones source mask, starts from the target-vocabulary
    ``<s>`` index and produces as many positions as ``src`` has columns.

    Returns:
        Tuple ``(embedding, sentence)``: the encoder output at the first
        position as a NumPy array with one row, and the decoded tokens looked
        up in ``trg_vocab.itos`` and joined with single spaces.
    """
    seq_len = src.shape[1]
    full_mask = Variable(torch.ones(1, 1, seq_len))
    start_idx = trg_vocab.stoi["<s>"]
    embedding, decode_idx = greedy_decode(model, src, full_mask, seq_len,
                                          start_symbol=start_idx)

    # Keep only the first position of the memory, as a 2-D (1, dim) array.
    sentence_vector = embedding.detach().numpy()[0, 0:1, :]
    words = [trg_vocab.itos[idx] for idx in decode_idx[0]]
    return sentence_vector, " ".join(words)

In [397]:
# Turn every sentence into a one-row LongTensor: the <cls> index followed by the
# index of each lower-cased, whitespace-separated word.
text1_idx = [
    torch.LongTensor([[src_vocab.stoi['<cls>']] + [src_vocab.stoi[word] for word in s.lower().split()]])
    for s in sentence_pairs['text1']
]

text2_idx = [
    torch.LongTensor([[src_vocab.stoi['<cls>']] + [src_vocab.stoi[word] for word in s.lower().split()]])
    for s in sentence_pairs['text2']
]

In [398]:
# Pre-allocate one embedding row and one generated-text slot per example, for the
# traditional autoencoder (ae_*) and the siamese autoencoder (sae_*).
# Every name gets its own independent array / list.
ae_em1, ae_em2, sae_em1, sae_em2 = (
    np.zeros((NUM_EXAMPLES, EMBEDDING_DIM)) for _ in range(4)
)
ae_text1, ae_text2, sae_text1, sae_text2 = (
    [""] * NUM_EXAMPLES for _ in range(4)
)

In [399]:
# Run both sentences of every pair through each model (traditional first, then
# siamese) and store the embedding and the generated text at the pair's row.
for row, (first_ids, second_ids) in enumerate(zip(text1_idx, text2_idx)):
    for net, emb_a, txt_a, emb_b, txt_b in (
        (autoencoder, ae_em1, ae_text1, ae_em2, ae_text2),
        (siamese_autoencoder, sae_em1, sae_text1, sae_em2, sae_text2),
    ):
        emb_a[row, :], txt_a[row] = generate_sentence(net, first_ids)
        emb_b[row, :], txt_b[row] = generate_sentence(net, second_ids)

In [400]:
# Collect each model's generated sentences in a frame with text1 / text2 columns.
ae_generations = pd.DataFrame({"text1": ae_text1, "text2": ae_text2})
sae_generations = pd.DataFrame({"text1": sae_text1, "text2": sae_text2})

## Compare Autoencoders

In [414]:
# Compare embeddings: per-row Euclidean and cosine distance between the
# traditional and the siamese embedding of text1.
embed_diff = np.linalg.norm(ae_em1 - sae_em1, axis=1)
cosine_dist = [
    scipy.spatial.distance.cosine(ae_row, sae_row)
    for ae_row, sae_row in zip(ae_em1, sae_em1)
]

[0.05975877050345324,
 0.07046368342948561,
 0.0459657557622174,
 0.06984496421789355,
 0.050298685320323444,
 0.0473509475663324,
 0.05905873002209139,
 0.07520556659799649,
 0.04495040340761092,
 0.06859156281609291,
 0.05226503200430921,
 0.0474738738549465,
 0.05144096937983178,
 0.060098714580836265,
 0.05847989056459124,
 0.06296672660220604,
 0.052545018244673836,
 0.06444811661341143,
 0.048711909384371754,
 0.05625952098941511]

In [402]:
# Compare generated sentences: for every example print the original text1 entry
# followed by what each autoencoder generated for it.
for i in range(len(text1_idx)):
    for heading, frame in (
        ("Ground Truth:", sentence_pairs),
        ("Traditional Autoencoder:", ae_generations),
        ("Siamese Autoencoder:", sae_generations),
    ):
        print(heading)
        print(frame.iloc[[i]]["text1"])

Ground Truth:
0    works fine, as expected 
Name: text1, dtype: object
Traditional Autoencoder:
0    <s> works <unk> as expected
Name: text1, dtype: object
Siamese Autoencoder:
0    <s> works <unk> as expected
Name: text1, dtype: object
Ground Truth:
1    works fine, as expected 
Name: text1, dtype: object
Traditional Autoencoder:
1    <s> works <unk> as expected
Name: text1, dtype: object
Siamese Autoencoder:
1    <s> works <unk> as expected
Name: text1, dtype: object
Ground Truth:
2    Works, great, shipped fast, great price.
Name: text1, dtype: object
Traditional Autoencoder:
2    <s> <unk> shipped <unk> great <unk> </s>
Name: text1, dtype: object
Siamese Autoencoder:
2    <s> <unk> shipped <unk> <unk> great </s>
Name: text1, dtype: object
Ground Truth:
3    This cable is awesome and does the required jo...
Name: text1, dtype: object
Traditional Autoencoder:
3    <s> this cable is awesome and does the require...
Name: text1, dtype: object
Siamese Autoencoder:
3    <s> this cable is 

In [403]:
# Full, untruncated text the traditional autoencoder generated for the row labelled 0.
ae_generations.loc[0, "text1"]

'<s> works <unk> as expected'

In [404]:
#t1 = "<cls> my name is mouse"
t1 = "<cls> although i did a reset and put the information back it , it still was only about 1/2 the wired speed ."
# Map the space-separated tokens to source-vocabulary indices, then show only the
# generated text (second element of the returned tuple).
t1_ids = torch.LongTensor([[src_vocab.stoi[word] for word in t1.split(" ")]])
generate_sentence(autoencoder, t1_ids)[1]
#a = torch.LongTensor([[vocab.vocab.stoi["works"], vocab.vocab.stoi["works"]]])
#len(a)
#vocab.vocab.stoi["works"]

'<s> although i did a reset and put the information back it , still it was only about 1/2 the wired speed .'

In [393]:
sent = "<cls> although i did a reset and put the information back it , it still was only about 1/2 the wired speed ."
# Map the space-separated tokens to source-vocabulary indices, then show only the
# text generated by the siamese autoencoder.
sent_ids = torch.LongTensor([[src_vocab.stoi[word] for word in sent.split(" ")]])
generate_sentence(siamese_autoencoder, sent_ids)[1]

'<s> although i did a reset and put the information back it , it still was only about 1/2 the load speed .'

In [386]:
#sent = " <s> i am sure it would be great for tvs , unfortunately , it did not provide what i was looking for . "
#sent = "<s> works ."
# Show only the embedding (first element of the returned tuple) that the siamese
# autoencoder produces for `sent`.
sent_ids = torch.LongTensor([[src_vocab.stoi[word] for word in sent.split(" ")]])
generate_sentence(siamese_autoencoder, sent_ids)[0]

array([[ 0.327527  , -0.07893595, -0.03025211,  0.06831298, -0.04272245,
         0.3032343 , -0.05806926,  0.17110014,  0.23537478,  0.31595305,
         0.2501623 , -0.1290037 , -0.4187692 , -0.12036887, -0.17526104,
         0.09711945,  0.13879128, -0.23180503, -0.5544039 ,  0.2852068 ,
         0.13227893,  0.32887796, -0.07234141,  0.35504264, -0.12981115,
        -0.09143321,  0.25975046, -0.00717225,  0.23184444,  0.02577189,
         0.10984887, -0.24008559, -0.23390111,  0.07360381,  0.33547848,
        -0.38813397, -0.01689984,  0.15639558, -0.165597  ,  0.37081578,
        -0.03423387,  0.23196715, -0.04946449,  0.06075588, -0.03328081,
        -0.21012552,  0.02266237,  0.13535114,  0.15985373, -0.0694741 ,
         0.00838902, -0.17869434,  0.04301745, -0.15593764, -0.01497189,
         0.53302747,  0.45639804,  0.08358166, -0.21247894, -0.0152492 ,
        -0.42844784,  0.20368141,  0.32746452, -0.04071741, -0.26871657,
         0.0035231 , -0.36912322, -0.47927547, -0.2

In [388]:
# Element-wise difference between the siamese and the traditional embedding of `sent`.
sent_ids = torch.LongTensor([[src_vocab.stoi[word] for word in sent.split(" ")]])
siamese_embedding = generate_sentence(siamese_autoencoder, sent_ids)[0]
plain_embedding = generate_sentence(autoencoder, sent_ids)[0]
siamese_embedding - plain_embedding

array([[-0.06696489, -0.01360267, -0.11010815, -0.06046246, -0.01499155,
         0.13420883, -0.0139626 , -0.07353418,  0.02505079, -0.04594058,
        -0.04192559, -0.01561375, -0.04797715,  0.03600286,  0.00644682,
        -0.0350748 , -0.01240247,  0.02584046, -0.06989545,  0.08013795,
        -0.00676228,  0.1262049 ,  0.01782373, -0.14406513, -0.03961991,
         0.01467997, -0.08886129, -0.03374509, -0.04057646,  0.03435272,
        -0.01613023, -0.03020197,  0.0252299 , -0.04335134,  0.10916996,
        -0.05861974, -0.06997047, -0.02427427,  0.0672856 , -0.01678962,
         0.00091328, -0.07406348,  0.07543211, -0.12470326, -0.09766726,
        -0.07078223,  0.04226875,  0.02764156,  0.03387237, -0.03993648,
        -0.06418075, -0.13136077,  0.05601442, -0.01439732,  0.01550303,
         0.04695272,  0.00230411, -0.0363141 , -0.0106689 ,  0.01355925,
        -0.12594956,  0.06793629, -0.02647994, -0.04592808, -0.15866584,
        -0.04514493, -0.14673346,  0.00191793, -0.0

In [None]:
# TODO
# - How much do embeddings change (Euclidean distance, cosine distance)
# - How much do sentences differ (n-gram overlap {1,2}) --> should be low
# - If we cluster these vectors (using K-Means or DBSCAN):
#   - Are sentences placed in different clusters (is there a significant effect, does it depend on the hyperparameters)
#   - Are clusters more semantically integrated (using WordNet, 
#                                                How BOW / Word2Vec is clustered --> more diagnostic, 
#                                                topic modelling on random permutations of the sentences in cluster,
#                                                jaccard similarity
#                                               )
# 



# Jaccard Similarity, Average of vectors for sentences as a metric (to see how close it is to clustering by Word2Vec)
# Semantic Ontologies
# Experiments: how are paired vectors placed compared to each other