In [None]:
import pandas as pd

# Read the pipe-delimited quotes file and keep a seeded 1% random sample of rows.
corpus = (
    pd.read_csv('quotes_clean.csv', sep='|')
    .sample(frac=0.01, random_state=0)
)

In [None]:
from datasets import Dataset

# Wrap the sampled DataFrame in a Hugging Face Dataset.
# NOTE(review): `corpus` is a random sample, so its index is presumably
# non-contiguous and will likely be kept as an extra `__index_level_0__`
# column; pass preserve_index=False if that column is unwanted — confirm.
corpus_dataset = Dataset.from_pandas(df=corpus)
corpus_dataset

In [None]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the encoder model below.
model_ckpt = "sentence-transformers/paraphrase-albert-small-v2"

# Load both from the same checkpoint so the tokenizer's vocabulary matches the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
from tqdm.auto import tqdm

def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            is indexed here as ``[batch, position, ...]``.

    Returns:
        ``last_hidden_state`` sliced to position 0 along its second axis
        (presumably the [CLS] token for this tokenizer — confirm).
    """
    token_states = model_output.last_hidden_state
    first_position = token_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed one string or a list of strings with the module-level model.

    Args:
        text_list: A single string or a list of strings. It is passed straight
            to ``tokenizer`` with padding and truncation enabled.

    Returns:
        The tensor returned by ``cls_pooling`` for the model output: one
        first-position hidden-state vector per input text. The forward pass
        runs under ``torch.no_grad()``, so the result carries no autograd
        graph (calling ``.detach()`` on it remains valid).
    """
    # Local import: torch is not imported by the visible notebook cells.
    import torch

    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Follow the model's device instead of hard-coding 'cpu', so this keeps
    # working if the model is later moved to an accelerator.
    device = model.device
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip autograd bookkeeping so activations are not retained.
    with torch.no_grad():
        model_output = model(**encoded_input)
    # NOTE(review): the model card for this sentence-transformers checkpoint
    # appears to recommend attention-mask mean pooling rather than CLS pooling;
    # CLS pooling is kept so existing embeddings stay comparable — confirm.
    return cls_pooling(model_output)


# Embed the "quote" column in batches: one model forward pass per batch rather
# than one per row. get_embeddings returns one vector per input text, so each
# row of the resulting 2-D array lines up with one example in the batch.
embeddings_dataset = corpus_dataset.map(
    lambda batch: {
        "embeddings": get_embeddings(batch["quote"]).detach().cpu().numpy()
    },
    batched=True,
    batch_size=32,
)

# Attach a FAISS index over the new column for nearest-neighbour lookups.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Example query: embed a single sentence and fetch its 10 nearest quotes
# from the FAISS index on the "embeddings" column.
sentence = "Don't listen to the critics"
sentence_embedding = (
    get_embeddings([sentence])
    .detach()
    .cpu()
    .numpy()
)
scores, samples = embeddings_dataset.get_nearest_examples(
    index_name="embeddings", query=sentence_embedding, k=10
)
samples

In [None]:
import pickle

# Persist the tokenizer, model and indexed dataset as pickle files, in that order.
# NOTE(review): pickle files must only be loaded from trusted sources.
# NOTE(review): embeddings_dataset carries the FAISS index added earlier —
# confirm it round-trips through pickle with the installed library versions.
for pickle_path, artifact in (
    ('tokenizer.pickle', tokenizer),
    ('model.pickle', model),
    ('embeddings_dataset.pickle', embeddings_dataset),
):
    with open(pickle_path, 'wb') as pkl:
        pickle.dump(artifact, pkl)

# End of notebook