In [None]:
import pandas as pd
import datasets
import numpy as np

# Read the '|'-separated quotes CSV into a DataFrame, then convert it to a
# `datasets.Dataset` so its columns can be pulled out by name in later cells.
corpus = pd.read_csv('output/quotes_export_save_28-02.csv', sep='|')
corpus_dataset = datasets.Dataset.from_pandas(corpus)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence encoder.
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed every entry of the "quote" column in batches of 100, asking for
# normalized vectors returned as a NumPy array, with a progress bar.
embeddings = encoder.encode(
    sentences=corpus_dataset["quote"],
    batch_size=100,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# Assemble a new dataset that pairs each embedding vector with the quote and
# author it came from. Column order is: embedding, quote, author.
dataset_embeddings = datasets.Dataset.from_dict(
    mapping={
        "embedding": embeddings,
        "quote": corpus_dataset["quote"],
        "author": corpus_dataset["author"],
    },
)

In [None]:
# Persist the assembled dataset under 'export_dataset' via save_to_disk.
dataset_embeddings.save_to_disk('export_dataset')

In [None]:
import pickle

# Build a FAISS index over the 'embedding' column of the dataset.
dataset_embeddings = dataset_embeddings.add_faiss_index(column='embedding')

# Drop the raw vectors only on the copy that gets pickled. Rebinding
# `dataset_embeddings` itself to the stripped dataset would leave every later
# cell that indexes or queries the 'embedding' column without that column when
# the notebook is run top to bottom.
dataset_embeddings_export = dataset_embeddings.remove_columns('embedding')
with open('output/dataset_embeddings_28-02.pickle', 'wb') as pkl:
    pickle.dump(dataset_embeddings_export, pkl)

# Optional: FAISS index only

In [None]:
# (Re)build a FAISS index over the 'embedding' column; this requires the
# column to still be present on `dataset_embeddings`.
dataset_embeddings = dataset_embeddings.add_faiss_index(column='embedding')

In [None]:
# Write the index registered under the name 'embedding' to its own file.
dataset_embeddings.save_faiss_index('embedding', 'output/index_28-02.faiss')

In [None]:
import faiss

# Reload the standalone index from the file written by save_faiss_index in the
# previous cell. The earlier path ('index_alone.faiss') is never produced by
# this notebook, so a fresh run could not find it.
index = faiss.read_index('output/index_28-02.faiss')

# Test model

In [None]:
# Example query sentence; it is encoded and searched for in the cells below.
sentence = "Knowledge of history is power."


In [None]:
# Embed the query sentence (passed as a one-element batch), then fetch the 10
# nearest rows through the index registered under 'embedding'.
sentence_embedding = encoder.encode([sentence])
scores, samples = dataset_embeddings.get_nearest_examples(
    index_name='embedding',
    query=sentence_embedding,
    k=10,
)
samples

In [None]:
import pickle

# Serialize the sentence encoder object itself to 'model.pickle'.
# NOTE(review): a pickled model is presumably tied to the installed library
# versions — consider the library's own save/load mechanism; confirm.
with open('model.pickle', 'wb') as pkl:
    pickle.dump(encoder, pkl)

# Tests

In [None]:
import numpy as np
import os
import pickle
import torch
from sentence_transformers import util



# Reload the pickled encoder and the pickled dataset. Context managers close
# the file handles deterministically instead of leaking them.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open("model.pickle", "rb") as pkl:
    model = pickle.load(pkl)
with open("embeddings_dataset.pickle", "rb") as pkl:
    embeddings_dataset = pickle.load(pkl)

# Distinct author names found in the dataset.
authors = np.unique(embeddings_dataset["author"])
# Raw FAISS index object behind the dataset index named "embeddings".
faiss_index = embeddings_dataset.get_index("embeddings").faiss_index


In [None]:
# Pickle only the raw FAISS index object extracted from the dataset above.
with open('index_only.pickle', 'wb') as pkl:
    pickle.dump(faiss_index, pkl)

In [None]:
# Detach the index named 'embeddings' from the dataset.
# NOTE(review): presumably done so the column can be read back as plain data
# in the next cell — confirm.
embeddings_dataset.drop_index('embeddings')

In [None]:
# Materialize the stored "embeddings" column as a float32 NumPy array.
embeddings_numpy = np.array(embeddings_dataset["embeddings"], dtype=np.float32)

In [None]:
# Pickle the float32 embedding matrix to 'embeddings_numpy.pickle'.
with open('embeddings_numpy.pickle', 'wb') as pkl:
    pickle.dump(embeddings_numpy, pkl)

In [None]:
# Re-encode the query sentence (as a one-element batch) with the model that
# was reloaded from 'model.pickle'.
sentence_embedding = model.encode([sentence])

In [None]:
# NOTE(review): presumably the author the search below should be restricted
# to — confirm.
author_name = 'Victor Hugo'

In [None]:
from sentence_transformers.util import semantic_search
# Row positions of every quote attributed to `author_name`; the search below
# is restricted to these rows. (This name was previously never defined, so the
# cell failed on a fresh kernel.)
author_indexes = np.flatnonzero(
    np.asarray(list(embeddings_dataset["author"])) == author_name
)

# Search only that author's vectors. `embeddings_numpy` is the plain float32
# matrix extracted earlier; a `datasets.Dataset` cannot be sliced as [rows, :].
hits = semantic_search(sentence_embedding, embeddings_numpy[author_indexes, :], top_k=5)

# `corpus_id` is a position inside the author subset, so map it back to the
# row number in the full dataset (as a plain int).
list_hits = [int(author_indexes[hit['corpus_id']]) for hit in hits[0]]

# Display the matching rows, using the computed hits rather than a pasted,
# hard-coded list of row numbers.
embeddings_dataset.select(list_hits)

# End of notebook