A brief explanation of sentence embeddings: https://stackoverflow.com/questions/59877385/what-is-the-difference-between-sentence-encodings-and-contextualized-word-embedd

Current issues:
1. Duplicate questions are not removed when parsing the dataset.
2. The embeddings still need to be projected somehow.

## Importing libraries, defining functions and loading *data*

In [None]:
!pip install transformers
!pip install tensorboard
!pip install sentence_transformers
import pandas as pd
import numpy as np
import torch
import pandas as pd
from time import time
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import tensorflow_hub as hub # For USE
from sentence_transformers import SentenceTransformer
import os
import tensorflow as tf
import datetime
from tensorboard.plugins import projector


# TF-Hub handle loaded when `model_name == "USE"`.
_USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Maps each accepted `model_name` to the SentenceTransformer checkpoint it loads.
_SENTENCE_TRANSFORMER_CHECKPOINTS = {
    # All-round model tuned for many use-cases. Trained on a large and diverse
    # dataset of over 1 billion training pairs.
    "all-MiniLM": "all-MiniLM-L12-v2",
    "paraphrase-MiniLM-L6-v2": "paraphrase-MiniLM-L6-v2",
    # Tuned for semantic search: given a query/question, it can find relevant
    # passages. Trained on a large and diverse set of (question, answer) pairs.
    "multi-qa-MiniLM-L6-cos-v1": "multi-qa-MiniLM-L6-cos-v1",
    "multi-qa-mpnet-base-dot-v1": "multi-qa-mpnet-base-dot-v1",
    # All-round model tuned for many use-cases. Trained on a large and diverse
    # dataset of over 1 billion training pairs.
    "all-mpnet-base-v2": "all-mpnet-base-v2",
    # The capital-V spelling is the historical key; the lowercase alias matches
    # the actual checkpoint name so either spelling works.
    "paraphrase-albert-small-V2": "paraphrase-albert-small-v2",
    "paraphrase-albert-small-v2": "paraphrase-albert-small-v2",
    "all-distilroberta-v1": "all-distilroberta-v1",
}


def generate_embeddings(model_name, data_ls):
    """Embed the sentences in `data_ls` with the model selected by `model_name`.

    Args:
        model_name: "USE" to load the Universal Sentence Encoder from TF-Hub,
            or one of the keys of `_SENTENCE_TRANSFORMER_CHECKPOINTS`.
        data_ls: The sentences to embed; passed unchanged to the model.

    Returns:
        For "USE", the result of calling `.numpy()` on the hub module's
        output; otherwise whatever `SentenceTransformer.encode` returns.

    Raises:
        ValueError: If `model_name` is not a supported model. (Previously the
            function printed "sorry" and returned None, which only failed
            later when the caller used the result.)
    """
    if model_name == "USE":
        embed = hub.load(_USE_MODEL_URL)
        return embed(data_ls).numpy()

    checkpoint = _SENTENCE_TRANSFORMER_CHECKPOINTS.get(model_name)
    if checkpoint is None:
        # Fail before any model download is attempted.
        supported = ", ".join(["USE", *_SENTENCE_TRANSFORMER_CHECKPOINTS])
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of: {supported}"
        )

    model = SentenceTransformer(checkpoint)
    return model.encode(data_ls)

In [None]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored on
# Drive are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace ...?" prompt.
!unzip -o "drive/MyDrive/Quora-QnA/quora-question-pairs" -d "quora-question-pairs"
!unzip -o "quora-question-pairs/train.csv.zip"
!unzip -o "quora-question-pairs/test.csv.zip"


Archive:  drive/MyDrive/Quora-QnA/quora-question-pairs.zip
replace quora-question-pairs/sample_submission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Generate_embeddings

In [None]:
# Load the extracted training CSV, using the file's first column as the row index.
df = pd.read_csv("train.csv", index_col=0)


In [None]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# Gather every usable question into one flat list: all of `question1` first,
# then all of `question2`. Entries that are not strings or are empty strings
# are skipped and counted. Duplicates are deliberately kept so list positions
# stay stable.
count = 0
sentences = []
for column in ("question1", "question2"):
    for text in df[column]:
        if isinstance(text, str) and text:
            sentences.append(text)
        else:
            count += 1

print(count, " sentences removed.")

3  sentences removed.


In [None]:
# sentences = sentences[:100]

In [None]:
# Time the embedding run. The elapsed time is read once so the seconds and
# minutes printed below are derived from the same measurement.
t = time()
embeds = generate_embeddings("all-mpnet-base-v2", sentences)
elapsed = time() - t
print("Time taken: " , int(elapsed % 60) ,"s, ", int(elapsed / 60), "m")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
print("Done")

In [None]:
# Sanity check: print both lengths so they can be compared by eye
# (presumably one embedding per sentence — confirm the two numbers match).
print(len(embeds))
print(len(sentences))

In [None]:
# embedsDf = pd.DataFrame(embeds)
# embedsDf.insert(0, 'sentences', sentences)
# embedsDf.head()

In [None]:
# from google.colab import files

# embedsDf.to_csv("QQD-sentence-embeddings.tsv",  sep='\t', index=None, header=None , encoding = 'utf-8-sig') 
# files.download("QQD-sentence-embeddings.tsv")

In [None]:
# df.to_csv('metadata.tsv', index=False, sep='\t')

## Projecting the embeddings

##### Gaurav sir's notebooks

In [None]:
# def register_embedding(embedding_tensor_name, meta_data_fname, log_dir):
#     config = projector.ProjectorConfig()
#     embedding = config.embeddings.add()
#     embedding.tensor_name = embedding_tensor_name
#     embedding.metadata_path = meta_data_fname
#     projector.visualize_embeddings(log_dir, config)

# def get_tensor_data(embeddings, texts):
#     x = embeddings#
#     # printx
#     y = np.array(texts)
#     print(len(y))
#     return x, y


# def save_labels_tsv(labels, filepath, log_dir):
#     with open(os.path.join(log_dir, filepath), 'w') as f:
#         for label in labels:
#             f.write('{}\n'.format(label))

# def tensor_projector(clean_text_ls, emb_vec_np):
#     LOG_DIR = '/Users/extramarks/PycharmProjects/pythonProject/PARAPHRASE/'  # Tensorboard log dir

#     if not os.path.exists(LOG_DIR):
#       os.mkdir(LOG_DIR)

#     META_DATA_FNAME = 'meta.tsv'  # Labels will be stored here
#     EMBEDDINGS_TENSOR_NAME = 'embeddings'
#     EMBEDDINGS_FPATH = os.path.join(LOG_DIR, EMBEDDINGS_TENSOR_NAME + '.ckpt')
#     STEP = 0

#     x, y = get_tensor_data(emb_vec_np, clean_text_ls)
#     register_embedding(EMBEDDINGS_TENSOR_NAME, META_DATA_FNAME, LOG_DIR)
#     save_labels_tsv(y, META_DATA_FNAME, LOG_DIR)

#     # Size of files created on disk: 80.5kB
#     tensor_embeddings = tf.Variable(x, name=EMBEDDINGS_TENSOR_NAME)
#     saver = tf.compat.v1.train.Saver([tensor_embeddings])  # Must pass list or dict
#     saver.save(sess=None, global_step=STEP, save_path=EMBEDDINGS_FPATH)

#     # from tensorboard import program
#     #
#     tracking_address = "/Users/extramarks/PycharmProjects/pythonProject/PARAPHRASE"  # the path of your log file.
#     #
#     # tb = program.TensorBoard()
#     # tb.configure(argv=[None, '--logdir', tracking_address])
#     # url = tb.launch()
#     # print(f"Tensorflow listening on {url}")
#     os.system('tensorboard --logdir=' + tracking_address)
#     print("Check tb_projection")
#     return True

In [None]:
# tensor_projector(sentences, embeds)

##### Tensorflow docs

In [None]:
# %reload_ext tensorboard
# !rm -rf /logs/

In [None]:
# log_dir='/logs/sentence-encodings-qqd/'
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
# with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
#   for sentence in sentences:
#     f.write("{}\n".format(sentence))
# checkpoint = tf.train.Checkpoint(embedding=tf.Variable(embeds))
# checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))
# config = projector.ProjectorConfig()
# embedding = config.embeddings.add()
# embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# embedding.metadata_path = 'metadata.tsv'
# projector.visualize_embeddings(log_dir, config)

In [None]:
# NOTE(review): this log directory differs from the '/logs/sentence-encodings-qqd/'
# path used in the commented-out cell above, and the `%reload_ext tensorboard`
# line that loads the extension is also commented out — confirm both before running.
%tensorboard --logdir /logs/imdb-example/

##### Amitness Blog

In [None]:
# Export the embeddings as a tab-separated file on Drive for the projector.
embedding_df = pd.DataFrame(embeds)
path = '/content/drive/MyDrive/Literature Review/Week2/QQD Sentence Embeddings Full/output.tsv'
# Pass the path (not a text-mode handle opened without newline='') so pandas
# manages the file and its line endings itself; the encoding is unchanged.
# NOTE(review): pandas writes a header row of column labels by default —
# confirm the projector loader expects one.
embedding_df.to_csv(path, index=False, sep='\t', encoding='utf-8-sig')

In [None]:
# Export the sentences as the metadata (label) file matching the embeddings TSV.
sentences_df = pd.DataFrame(sentences)
path = '/content/drive/MyDrive/Literature Review/Week2/QQD Sentence Embeddings Full/metadata.tsv'
# Pass the path (not a text-mode handle opened without newline='') so pandas
# manages the file and its line endings itself; the encoding is unchanged.
# NOTE(review): pandas writes a header row of column labels by default —
# confirm the projector loader expects one.
sentences_df.to_csv(path, index=False, sep='\t', encoding='utf-8-sig')

## Conclusion

To get the generated TSV files, visit https://drive.google.com/drive/folders/1mgCnxX-VKux4t9Cs5EUVQ2uFfZWrppcR?usp=sharing and then load these files into the TensorFlow Embedding Projector.

Visit the following link to see an example of close sentences as shown in the TensorFlow **projector**:
https://drive.google.com/file/d/1QZCYXYkpqlTYCwgEuKvAWZ7vSI808oY2/view?usp=sharing

In [None]:
sentences[6796]

In [None]:
sentences[3718]