In [None]:
import pandas as pd

# Read the pipe-delimited quotes file into a DataFrame.
corpus = pd.read_csv(
    filepath_or_buffer='quotes_classical_clean.csv',
    sep='|',
)

In [None]:
import datasets

# Wrap the pandas frame in a `datasets.Dataset` so later cells can attach
# an embedding column and a FAISS index to it.
corpus_dataset = datasets.Dataset.from_pandas(df=corpus)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder and embed every entry of the "quote" column.
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# The result is requested as a NumPy array of normalised vectors,
# encoded in batches of 100 with a progress bar.
embeddings = encoder.encode(
    sentences=corpus_dataset["quote"],
    batch_size=100,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

In [None]:
import numpy as np
# Persist the raw embedding matrix with NumPy's own serialiser.
np.save(file='embeddings_numpy', arr=embeddings)

In [None]:
# Build a dataset whose only column is the embedding matrix, then attach a
# FAISS index over that column.
dataset_embeddings = datasets.Dataset.from_dict(
    mapping={"embeddings": embeddings},
)
dataset_embeddings.add_faiss_index(column="embeddings")

In [None]:
import pickle

# Pickle the embeddings-only dataset to disk.
with open('embeddings_with_faiss.pickle', mode='wb') as out_file:
    pickle.dump(dataset_embeddings, out_file)

In [None]:
# Put the embeddings in their own single-column dataset ...
dset_embed = datasets.Dataset.from_dict(mapping={"embeddings": embeddings})

# ... and join it column-wise (axis=1) with the corpus dataset.
embeddings_dataset = datasets.concatenate_datasets(
    dsets=[corpus_dataset, dset_embed],
    axis=1,
)

# Attach a FAISS index over the embedding column of the combined dataset.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Write only the FAISS index (not the dataset rows) to its own file.
embeddings_dataset.save_faiss_index(
    index_name='embeddings',
    file='index_alone.faiss',
)

In [None]:
import faiss

# Read the standalone FAISS index back from 'index_alone.faiss', the file
# written by the save_faiss_index cell earlier in this notebook.
index = faiss.read_index('index_alone.faiss')

In [None]:
# Pickle the combined dataset (corpus columns plus embeddings) to disk.
with open('complete_embeddings_with_faiss.pickle', mode='wb') as out_file:
    pickle.dump(embeddings_dataset, out_file)

In [None]:
# Pickle the corpus dataset on its own, without the embedding column.
with open('quotes_alone.pickle', mode='wb') as out_file:
    pickle.dump(corpus_dataset, out_file)

In [None]:
# Example query text used by the similarity-search cells below.
sentence = 'Knowledge of history is power.'


In [None]:
# Encode the query with the same options used for the corpus embeddings
# (NumPy output, normalised vectors) so the query and the indexed vectors
# are directly comparable.
sentence_embedding = encoder.encode(
    [sentence],
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Ask the FAISS index for the 10 nearest corpus rows. `scores` holds the
# values FAISS reports for each hit and `samples` holds the row ids, one
# row of results per query.
scores, samples = index.search(sentence_embedding, k=10)
samples

In [None]:
# Show the quotes for the neighbours returned by the search cell above.
# The row ids come from `samples` (first and only query row) rather than
# being pasted in by hand, so this cell stays correct when the corpus, the
# model or the query sentence changes.
np.array(embeddings_dataset['quote'])[samples[0]]

In [None]:
import pickle

# Pickle the sentence encoder so the "Tests" section can reload it.
with open('model.pickle', mode='wb') as model_file:
    pickle.dump(encoder, model_file)

# Pickle the combined dataset under the name the "Tests" section reads.
with open('embeddings_dataset.pickle', mode='wb') as dataset_file:
    pickle.dump(embeddings_dataset, dataset_file)

# Tests

In [None]:
import numpy as np
import os
import pickle
import torch
from sentence_transformers import util



# Reload the artifacts pickled earlier in this notebook.
# SECURITY NOTE: unpickling executes code embedded in the file, so only load
# pickles that this notebook (or another trusted source) produced.
# The files are opened with context managers so the handles are closed
# deterministically instead of being left to the garbage collector.
with open("model.pickle", "rb") as model_file:
    model = pickle.load(model_file)
with open("embeddings_dataset.pickle", "rb") as dataset_file:
    embeddings_dataset = pickle.load(dataset_file)

# Distinct values of the "author" column.
authors = np.unique(embeddings_dataset["author"])

# The underlying FAISS index object attached to the "embeddings" column.
faiss_index = embeddings_dataset.get_index("embeddings").faiss_index


In [None]:
# Pickle just the FAISS index object extracted from the dataset.
with open('index_only.pickle', mode='wb') as out_file:
    pickle.dump(faiss_index, out_file)

In [None]:
# Detach the FAISS index from the dataset; the embedding column itself stays.
embeddings_dataset.drop_index(index_name='embeddings')

In [None]:
# Materialise the embedding column as a float32 NumPy array.
embeddings_numpy = np.array(
    embeddings_dataset["embeddings"]
).astype("float32")

In [None]:
# Pickle the float32 embedding matrix.
with open('embeddings_numpy.pickle', mode='wb') as out_file:
    pickle.dump(embeddings_numpy, out_file)

In [None]:
# Embed the query sentence (as a one-element batch) with the reloaded model.
sentence_embedding = model.encode(sentences=[sentence])

In [None]:
# Author whose quotes the search below is restricted to.
author_name = "Victor Hugo"

In [None]:
from sentence_transformers.util import semantic_search
# Row positions (in `embeddings_dataset`) of every quote by `author_name`.
# The original cell used `author_indexes` without ever defining it, so it
# failed on a fresh kernel.
author_indexes = np.flatnonzero(
    np.array(embeddings_dataset["author"]) == author_name
)

# Search only that author's embeddings. The float32 matrix is sliced because
# a `datasets.Dataset` cannot be indexed with `[rows, :]`.
hits = semantic_search(
    sentence_embedding,
    embeddings_numpy[author_indexes],
    top_k=5,
)

# `corpus_id` is a position inside the author subset; map it back to a row
# position in the full dataset.
list_hits = [int(author_indexes[hit['corpus_id']]) for hit in hits[0]]

# Display the matching rows, using the computed ids rather than ids pasted
# in from an earlier run.
embeddings_dataset.select(list_hits)

# Export

In [None]:
import os

import psycopg2

# Credentials must never live in the notebook: the password is read from the
# environment instead. NOTE(review): the password that was previously
# hard-coded here has been exposed in the notebook history and should be
# rotated.
db_password = os.environ.get("PGPASSWORD")
if not db_password:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

# Non-secret connection settings keep their previous values as defaults and
# can be overridden through the standard PG* environment variables.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "railway"),
    user=os.environ.get("PGUSER", "postgres"),
    password=db_password,
    host=os.environ.get("PGHOST", "containers-us-west-127.railway.app"),
    port=os.environ.get("PGPORT", "5800"),
)

# Every statement is committed immediately; no explicit transaction is open.
conn.autocommit = True
cursor = conn.cursor()

In [None]:
# Create the target table.
sql = '''CREATE TABLE DETAILS(employee_id int NOT NULL,\
employee_name char(20),\
employee_email varchar(30), employee_salary float);'''

# Bulk-load the CSV through the client connection. `COPY ... FROM '<path>'`
# makes the *database server* open the path, which cannot work for a file
# that lives on this machine while the database is remote, so the data is
# streamed via STDIN instead.
sql2 = '''COPY details(employee_id,employee_name,\
employee_email,employee_salary)
FROM STDIN
DELIMITER ','
CSV HEADER;'''

# Read everything back to verify the load.
sql3 = '''select * from details;'''

try:
    cursor.execute(sql)

    # newline='' passes the file's line endings through untouched so the
    # server-side CSV parser sees the data exactly as stored.
    # NOTE(review): absolute local path kept from the original cell;
    # consider making it configurable.
    with open('/private/tmp/details.csv', 'r', newline='') as csv_file:
        cursor.copy_expert(sql2, csv_file)

    cursor.execute(sql3)
    for row in cursor.fetchall():
        print(row)

    conn.commit()
finally:
    # Release the cursor and connection even when a statement above fails.
    cursor.close()
    conn.close()

# End of notebook