In [2]:
%load_ext autoreload
%autoreload 2

import dill
import numpy as np
import os
import pandas as pd
import pickle

import torch

import cluster_analysis
import siamese
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Very Small Dataset

In [3]:
# Read the paired-sentence CSV located at ../src/data relative to the
# notebook's working directory.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which suggests
# the CSV was written together with its index -- consider `index_col=0`.
sentence_pairs = pd.read_csv(
    os.path.join("..", "src", "data", "paired_sentences.csv"),
    sep=",",
)
# Show the first rows as a quick sanity check of the loaded frame.
sentence_pairs.head()

Unnamed: 0,text1,text2
0,"works fine, as expected",works great and cannot beat the price!
1,"works fine, as expected",worked very good for what i needed and the pri...
2,"works, great, shipped fast, great price.",this cable works perfectly and at a great price!
3,this cable is awesome and does the required jo...,"excellent product, no complains, good price, w..."
4,so far it is working great.,so far this item is great.


In [4]:
# Flatten both text columns into one list: every `text1` entry first,
# followed by every `text2` entry (same order as the rows in the frame).
sentences = [*sentence_pairs["text1"], *sentence_pairs["text2"]]

In [5]:
# Build the per-sentence sets that the Jaccard statistics cell consumes
# (presumably one token set per sentence -- confirm in `utils`).
sentence_sets = utils.create_sentence_set(sentences)

# Two-step tokenization: first produce the spaCy docs, then convert them to
# the text form that is later indexed with the source vocabulary.
spacy_docs = utils.create_spacy_docs(sentences)
sentence_tokenized = utils.create_spacy_text(spacy_docs)

## Load Model & Vocab

In [6]:
# Load the source and target vocabularies from files in the current working
# directory.
# NOTE(review): `torch.load` deserializes with pickle, so these files must come
# from a trusted source; recent PyTorch releases may also require an explicit
# `weights_only` setting to load non-tensor objects -- confirm for the version
# in use.
src_vocab = torch.load("src_vocab.pt")
trg_vocab = torch.load("trg_vocab.pt")

In [7]:
# Checkpoint of the plain autoencoder; its tensors are remapped to the CPU so
# the notebook does not need a GPU to load it.
# NOTE(review): this model is not referenced by the other cells visible here --
# confirm it is still needed.
model_path = "electronics_autoencoder_epoch7.pt"
cpu_device = torch.device("cpu")
autoencoder = torch.load(model_path, map_location=cpu_device)

In [8]:
# Checkpoint of the siamese autoencoder used for the embeddings below; its
# tensors are remapped to the CPU on load.
siamese_model_path = "siamese_ae_epoch_1"
siamese_autoencoder = torch.load(
    siamese_model_path,
    map_location=torch.device("cpu"),
)

## Find Embeddings (Small Dataset)

In [9]:
# Convert the tokenized sentences to index form using the source vocabulary
# (presumably one sequence of vocabulary indices per sentence -- confirm in
# `utils.indexify_sentences`).
sent_idxs = utils.indexify_sentences(src_vocab, sentence_tokenized)

In [88]:
# Run the siamese autoencoder over the indexed sentences. The helper returns a
# pair, unpacked here into the sentence embeddings and the generated sentences
# (presumably decoded with the target vocabulary -- confirm in `siamese`).
sentence_embeddings, generated_sentences = siamese.run_embeddings(siamese_autoencoder, trg_vocab, sent_idxs)

In [89]:
# Cluster the sentence embeddings with k-means; the cluster count is fixed at 4.
# NOTE(review): no random seed is visible here -- confirm whether
# `kmeans_labels` is deterministic across runs.
n_clusters = 4
labels = cluster_analysis.kmeans_labels(sentence_embeddings, n_clusters)

In [90]:
# Compute Jaccard statistics for the k-means labelling; the string "kmeans"
# is the tag that appears in the printed results.
jaccard_cluster_stats = cluster_analysis.cluster_jaccard_statistics(
    sentence_tokenized,
    sentence_sets,
    labels,
    "kmeans",
)

In [91]:
# Display the Jaccard statistics computed in the previous cell
# (the output lists avg / min / max / var for each entry).
jaccard_cluster_stats

[kmeans : jaccard: avg= 0.055556, min= 0.000000, max= 0.166667, var= 0.006173,
 kmeans : jaccard: avg= 0.006000, min= 0.000000, max= 0.052632, var= 0.000234,
 kmeans : jaccard: avg= 0.028300, min= 0.000000, max= 1.000000, var= 0.015669,
 kmeans : jaccard: avg= 0.061558, min= 0.000000, max= 1.000000, var= 0.026322]