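## Generate the document store database(s) that can be saved, distributed via GDrive, and loaded by anyone

Note: this notebook was originally written for a FAISS database. The code below currently builds an `ElasticsearchDocumentStore`; the FAISS setup, save and load calls are kept only as commented-out code.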

Steps:
1. Load the Chapters for the Series we're adding.
2. Load the Database if it exists, else create it
3. Load the files into the database and update their embeddings.
4. Save out the database and config.
5. Repeat for all Series.

In [1]:
# Name of the model used to create the embeddings. Ideally this is pulled from the
# Hugging Face hub.
MODEL_NAME = "TheSpaceManG/wildbow-distilbert"

# Prefix used for the names of the document store artifacts this notebook produces.
DOCUMENT_STORE_NAME_OUT = "wildbow"

# Series name -> path of the pickled, pre-formatted chapter list for that series.
# The paths are relative, so they are resolved against whatever the working
# directory is when the pickles are opened.
# NOTE(review): the original comment said these are relative to the /finetune/
# directory, while this notebook changes into .../db-gen/ -- confirm which is intended.
SERIES_FILENAMES = {
    "Pale": "../chapter_fmt_list.pkl",
    "Pact": "../pact_fmt_list.pkl",
    "Pate": "../pate_fmt_list.pkl",
    "Poke": "../poke_fmt_list.pkl",
    "Worm": "../worm_fmt_list.pkl",
    "Glow-Worm": "../glowworm_fmt_list.pkl",
    "Ward": "../ward_fmt_list.pkl",
    "Twig": "../twig_fmt_list.pkl",
}

In [2]:
%%bash

# Upgrade pip itself first so the install below uses a current resolver.
pip install --upgrade pip
# Install Haystack with its Colab extras.
# - Pinned to 1.14.0, the version the recorded run of this notebook installed, so
#   that a later re-run does not silently pick up a release with a different API.
# - Quoted so the shell can never treat "[colab]" as a glob pattern.
pip install 'farm-haystack[colab]==1.14.0'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 36.0 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Downloading farm_haystack-1.14.0-py3-none-any.whl (640 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 640.4/640.4 kB 7.1 MB/s eta 0:00:00
Collecting quantulum3
  Downloading quantulum3-0.8.1-py3-none-any.whl (10.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.7/10.7 MB 77.5 MB/s eta 0:00:00
Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing m



In [9]:
import os

# Working directory for this notebook on the mounted Google Drive, relative to the
# directory the runtime starts in.
DB_GEN_DIR = './drive/MyDrive/pale-companion-files/db-gen/'

print(os.getcwd())
# Only change directory when we are not already inside the target. The path is
# relative, so re-running this cell after a successful chdir would otherwise try to
# resolve it against the new working directory and raise FileNotFoundError.
if not os.getcwd().endswith(os.sep + os.path.normpath(DB_GEN_DIR)):
    os.chdir(DB_GEN_DIR)
print(os.getcwd())

/content
/content/drive/MyDrive/pale-companion-files/db-gen


In [3]:
%%bash

# Stop at the first failing command instead of carrying on with a missing or
# broken archive.
set -e

# Download the Elasticsearch 7.9.2 archive only when it is not already present, so
# re-running this cell neither downloads it again nor leaves a second ".1" copy.
# The download goes to a ".part" file and is renamed once complete, so an
# interrupted download is never mistaken for a finished archive.
if [ ! -f elasticsearch-7.9.2-linux-x86_64.tar.gz ]; then
  wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q -O elasticsearch-7.9.2-linux-x86_64.tar.gz.part
  mv elasticsearch-7.9.2-linux-x86_64.tar.gz.part elasticsearch-7.9.2-linux-x86_64.tar.gz
fi

# Unpack the archive and hand the extracted tree to the `daemon` user, which is the
# account the server is started under.
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [4]:
%%bash --bg

# Start the Elasticsearch server from the extracted elasticsearch-7.9.2 directory,
# running it as the `daemon` user (presumably because Elasticsearch will not run as
# root -- confirm). The `--bg` flag on the cell magic runs this script in the
# background, so the notebook can continue while the server keeps running.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch


In [5]:
import os
import time
import urllib.request

# Wait for the background Elasticsearch server to accept requests. Polling the
# cluster health endpoint replaces a fixed 30 second sleep: the cell continues as
# soon as the server is ready and fails loudly if it never comes up.
ES_STARTUP_TIMEOUT_S = 180  # give up after this many seconds
ES_POLL_INTERVAL_S = 2      # pause between two health checks

# Same host lookup as the document store uses; 9200 is Elasticsearch's default HTTP port.
es_health_url = (
    f"http://{os.environ.get('ELASTICSEARCH_HOST', 'localhost')}:9200"
    "/_cluster/health?wait_for_status=yellow&timeout=5s"
)

es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
    try:
        with urllib.request.urlopen(es_health_url, timeout=10) as es_response:
            if es_response.status == 200:
                break
    except OSError:
        # Connection refused, socket timeout or an HTTP error status (urllib's
        # URLError/HTTPError are OSError subclasses): the server is not ready yet.
        pass
    if time.monotonic() >= es_deadline:
        raise TimeoutError(
            f"Elasticsearch did not become ready within {ES_STARTUP_TIMEOUT_S} seconds "
            f"(polled {es_health_url})"
        )
    time.sleep(ES_POLL_INTERVAL_S)

In [6]:
import os
import pickle
import logging
import time
from haystack.document_stores import ElasticsearchDocumentStore
from haystack import Document
from haystack.nodes import PreProcessor, EmbeddingRetriever, Seq2SeqGenerator, TransformersSummarizer, FARMReader
from haystack.pipelines import GenerativeQAPipeline, ExtractiveQAPipeline, SearchSummarizationPipeline

In [7]:
# Root logging: level name, logger name and message, shown from WARNING upwards.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# The "haystack" logger is lowered to INFO so its progress messages are shown too.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)


In [10]:
# Load the pickled chapter list of every configured series, tag each chapter with
# the series it belongs to, and collect everything into one list.
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load chapter pickles that come from a trusted source.
all_chapters = []
for series_name, fname in SERIES_FILENAMES.items():
  with open(fname, 'rb') as f:
    chapters = pickle.load(f)
  # Fail with a clear message instead of a bare IndexError on chapters[0] below.
  if not chapters:
    raise ValueError(f"No chapters found in {fname!r} for series {series_name!r}")
  # Record the series name in each chapter's metadata (plain loop: this is done
  # for its side effect, not to build a list).
  for chapter in chapters:
    chapter['meta']['series'] = series_name
  # Show the metadata of the first chapter as a quick sanity check per series.
  print(chapters[0]['meta'])
  all_chapters.extend(chapters)

# Convert the plain dicts into Haystack Document objects.
fmt_chapters = [Document.from_dict(d) for d in all_chapters]
len(fmt_chapters)


{'arc_title': 'Blood Runs Cold', 'pov': 'Louise', 'wordcount': '7174', 'series_chapter_number': 1, 'arc_number': '0', 'extra_material': False, 'title': 'Blood Runs Cold - 0.0', 'chapter': '0.0', 'series': 'Pale'}
{'arc_title': 'Bonds', 'series_chapter_number': 1, 'arc_number': '1', 'pov': 'Chapter', 'title': 'Bonds - 1.1', 'chapter': '1.1', 'series': 'Pact'}
{'arc_title': 'Pate', 'series_chapter_number': 1, 'arc_number': 1, 'pov': 'N/A', 'title': 'Pate', 'chapter': '1.1', 'series': 'Pate'}
{'arc_title': 'Poke', 'series_chapter_number': 1, 'arc_number': 1, 'pov': 'N/A', 'title': '1 Poke', 'chapter': '1.1', 'series': 'Poke'}
{'arc_title': 'Gestation', 'series_chapter_number': 1, 'arc_number': '1', 'pov': 'Taylor', 'title': 'Gestation - 1.1', 'chapter': '1.1', 'series': 'Worm'}
{'arc_title': 'P', 'series_chapter_number': 1, 'arc_number': 'P', 'pov': 'Chapter', 'title': 'P - P.1', 'chapter': 'p.1', 'series': 'Glow-Worm'}
{'arc_title': 'Daybreak', 'series_chapter_number': 1, 'arc_number': '

1328

In [11]:
# Chunking parameters: chapters are split by word count, with some words shared
# between neighbouring chunks.
SPLIT_LENGTH_WORDS = 200
SPLIT_OVERLAP_WORDS = 20

preprocessor = PreProcessor(
    # Cleaning options
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    # Splitting options
    split_by='word',
    split_length=SPLIT_LENGTH_WORDS,
    split_overlap=SPLIT_OVERLAP_WORDS,
    split_respect_sentence_boundary=True,
    # Bookkeeping / display options
    add_page_number=True,
    progress_bar=True,
)

# Run every chapter through the preprocessor; `docs` holds the resulting documents.
docs = preprocessor.process(fmt_chapters)
print(f"We will be working with {len(docs)} documents from {len(fmt_chapters)} chapters")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocessing:   0%|          | 0/1328 [00:00<?, ?docs/s]



We will be working with 56880 documents from 1328 chapters


In [8]:
# Earlier FAISS-based setup, kept for reference only (not executed):
# TODO Modify this so that it can resume from an existing db
# document_store = FAISSDocumentStore(
#     faiss_index_factory_str="Flat",
#     similarity="cosine",
#     sql_url = f"sqlite:///{DOCUMENT_STORE_NAME_OUT}_sqldb.db",
#     # faiss_index = "pale.db",
#     )

# Host Elasticsearch is running on: the ELASTICSEARCH_HOST environment variable
# when it is set, otherwise localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Settings for the Elasticsearch-backed store. The index name is derived from
# DOCUMENT_STORE_NAME_OUT; no credentials are supplied.
es_store_options = {
    "host": host,
    "username": "",
    "password": "",
    "index": f"{DOCUMENT_STORE_NAME_OUT}_es",
    "duplicate_documents": 'skip',
    "similarity": 'cosine',
}
document_store = ElasticsearchDocumentStore(**es_store_options)

# Writing the documents is done in its own cell:
# document_store.write_documents(docs,batch_size=2500)

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable  HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry


In [12]:
# Write the preprocessed documents into the document store, 2500 at a time.
WRITE_BATCH_SIZE = 2500
document_store.write_documents(documents=docs, batch_size=WRITE_BATCH_SIZE)

In [13]:
# Retriever that produces the embeddings with the configured sentence-transformers
# model.
retriever = EmbeddingRetriever(
    embedding_model=MODEL_NAME,
    model_format="sentence_transformers",
    document_store=document_store,
    max_seq_len=500,
    progress_bar=True,
)

# Compute embeddings for the stored documents in batches of 2500.
# update_existing_embeddings=False leaves documents that already have an embedding
# untouched (the recorded log reports updating the docs "without embeddings").
document_store.update_embeddings(
    retriever=retriever,
    batch_size=2500,
    update_existing_embeddings=False,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model TheSpaceManG/wildbow-distilbert


Downloading (…)2f043/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8f53f2f043/README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading (…)53f2f043/config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)2f043/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading (…)8f53f2f043/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)3f2f043/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

INFO:haystack.document_stores.search_engine:Updating embeddings for all 56880 docs without embeddings...


Updating embeddings:   0%|          | 0/56880 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

In [None]:
# document_store.save(
#     index_path=f'{DOCUMENT_STORE_NAME_OUT}.index',
#     config_path=f'{DOCUMENT_STORE_NAME_OUT}.json'
# )

In [14]:
# Sanity check: display how many documents in the store have an embedding.
# NOTE(review): the store is created with duplicate_documents='skip', so this can
# presumably be lower than len(docs) if duplicate chunks were skipped -- confirm
# before turning this into a hard assertion.
document_store.get_embedding_count() # Should be the same length as our docs

56880

In [None]:
# new_doc_store = FAISSDocumentStore.load(
#     index_path=f'{DOCUMENT_STORE_NAME_OUT}.index',
#     config_path=f'{DOCUMENT_STORE_NAME_OUT}.json'
#     )
# new_doc_store.get_embedding_count() # Should be the same as above

43561