## Generate the FAISS database(s) that can be saved, distributed via Google Drive, and loaded by anyone

Steps:
1. Load the Chapters for the Series we're adding.
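2. Load the database if it exists, else create it. (Resuming from an existing database is still a TODO; the notebook currently always creates a new one.)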
3. Load the files into the database and update their embeddings.
4. Save out the database and config.
5. Repeat for all Series.

In [8]:
# Name of the model used to create the embeddings; ideally pulled from the Hugging Face Hub.
MODEL_NAME = "TheSpaceManG/wildbow-distilbert"

# Pickled chapter list for every known series, keyed by series name.
# The paths are relative: they are opened after the notebook has changed its
# working directory (see the os.chdir cell), so they resolve from that directory.
SERIES_FILENAMES = {
    series: f"../../{stem}_fmt_list.pkl"
    for series, stem in [
        ("Pale", "chapter"),
        ("Pact", "pact"),
        ("Pate", "pate"),
        ("Poke", "poke"),
        ("Worm", "worm"),
        ("Glow-Worm", "glowworm"),
        ("Ward", "ward"),
        ("Twig", "twig"),
    ]
}

# Keys of SERIES_FILENAMES to include in this database; any series not listed
# here is skipped when the chapters are loaded.
DB_FILES_USED = [
    "Pale", "Pact", "Pate", "Poke",
    # Currently excluded: "Worm", "Glow-Worm", "Ward", "Twig"
]

# Split length (in words) used by the preprocessing cell. The special value
# 'mixed' makes that cell build 150-, 250- and 400-word splits instead.
DOC_LENGTH = 150

# Base name shared by the SQLite file, FAISS index and config written later.
DOCUMENT_STORE_NAME_OUT = "otherverse_{}".format(DOC_LENGTH)

In [2]:
!pip install "faiss-gpu>=1.6.3,<2"
!pip install "sqlalchemy <2"
!pip install "farm-haystack==1.14.0"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu<2,>=1.6.3
  Downloading faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sqlalchemy<2
  Downloading SQLAlchemy-1.4.47-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sqlalchemy
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 2.0.9
    Uninstalling SQLAlchemy-2.0.9:
      Successfully uninst

In [3]:
import os
import pickle
import logging
import time
from haystack.document_stores import FAISSDocumentStore
from haystack import Document
from haystack.nodes import PreProcessor, EmbeddingRetriever, Seq2SeqGenerator, TransformersSummarizer, FARMReader
from haystack.pipelines import GenerativeQAPipeline, ExtractiveQAPipeline, SearchSummarizationPipeline

In [4]:
# Root logger: show WARNING and above as "<level> - <logger name> -  <message>".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# Loggers under the "haystack" namespace are allowed to emit INFO messages too.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)


In [5]:
# Move into the per-DOC_LENGTH working directory (presumably on the mounted
# Google Drive) so that the relative "../../*.pkl" inputs and the database files
# written later all resolve from there.
#
# The starting directory is remembered the first time this cell runs. Without
# that, re-running the cell would try to resolve the relative path from inside
# the target directory and fail.
if "NOTEBOOK_START_DIR" not in globals():
  NOTEBOOK_START_DIR = os.getcwd()

print(os.getcwd())
os.chdir(os.path.join(
    NOTEBOOK_START_DIR, 'drive', 'MyDrive', 'pale-companion-files', 'db-gen', str(DOC_LENGTH)
))
print(os.getcwd())

/content
/content/drive/MyDrive/pale-companion-files/db-gen/150


In [6]:
# Load the pickled chapters of every selected series, tag each chapter with the
# series it belongs to, and convert everything into Haystack Document objects.

# Fail loudly on a typo in DB_FILES_USED instead of silently building a database
# that is missing a series.
unknown_series = [name for name in DB_FILES_USED if name not in SERIES_FILENAMES]
if unknown_series:
  raise ValueError(f"DB_FILES_USED names series missing from SERIES_FILENAMES: {unknown_series}")

all_chapters = []
for series_name, fname in SERIES_FILENAMES.items():
  if series_name not in DB_FILES_USED:
    print(f"Not appending {series_name}")
    continue
  # NOTE(security): pickle.load can execute arbitrary code while loading;
  # only point this at chapter files that come from a trusted source.
  with open(fname, 'rb') as f:
    chapters = pickle.load(f)
  # Record the series on each chapter's metadata (mutates the loaded dicts in place).
  for chapter in chapters:
    chapter['meta']['series'] = series_name
  # Show the first chapter's metadata as a quick sanity check; an empty file is
  # reported rather than crashing with an IndexError.
  if chapters:
    print(chapters[0]['meta'])
  else:
    print(f"No chapters found in {fname}")
  all_chapters.extend(chapters)

fmt_chapters = [Document.from_dict(d) for d in all_chapters]
len(fmt_chapters)


{'arc_title': 'Blood Runs Cold', 'pov': 'Louise', 'wordcount': '7174', 'series_chapter_number': 1, 'arc_number': '0', 'extra_material': False, 'title': 'Blood Runs Cold - 0.0', 'chapter': '0.0', 'series': 'Pale'}
{'arc_title': 'Bonds', 'series_chapter_number': 1, 'arc_number': '1', 'pov': 'Chapter', 'title': 'Bonds - 1.1', 'chapter': '1.1', 'series': 'Pact'}
{'arc_title': 'Pate', 'series_chapter_number': 1, 'arc_number': 1, 'pov': 'N/A', 'title': 'Pate', 'chapter': '1.1', 'series': 'Pate'}
{'arc_title': 'Poke', 'series_chapter_number': 1, 'arc_number': 1, 'pov': 'N/A', 'title': '1 Poke', 'chapter': '1.1', 'series': 'Poke'}
Not appending Worm
Not appending Glow-Worm
Not appending Ward
Not appending Twig


465

In [9]:
# Split every chapter into word-based passages. DOC_LENGTH == 'mixed' produces
# three sets of splits (150, 250 and 400 words); otherwise only DOC_LENGTH is used.
if DOC_LENGTH == 'mixed':
  DOC_LENGTHS = [150, 250, 400]
else:
  DOC_LENGTHS = [DOC_LENGTH]

all_docs = []  # accumulates the splits of every length in DOC_LENGTHS
for doc_length in DOC_LENGTHS:
  preprocessor = PreProcessor(
      # cleaning options
      clean_empty_lines=True,
      clean_whitespace=True,
      clean_header_footer=True,
      # splitting options: consecutive passages overlap by a tenth of their length
      split_by='word',
      split_length=doc_length,
      split_overlap=doc_length // 10,
      split_respect_sentence_boundary=True,
      add_page_number=True,
      progress_bar=True,
  )
  # `docs` holds only the splits for the current length.
  docs = preprocessor.process(fmt_chapters)
  print(f"{doc_length} document length into {len(docs)} documents")
  all_docs += docs
print(f"We will be working with {len(all_docs)} documents from {len(fmt_chapters)} chapters")

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable  HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocessing:   0%|          | 0/465 [00:00<?, ?docs/s]



150 document length into 35874 documents
We will be working with 35874 documents from 465 chapters


In [10]:
# TODO Modify this so that it can resume from an existing db
# Create a new FAISS-backed document store: a "Flat" index with cosine similarity,
# with document text and metadata kept in a SQLite file in the working directory.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    similarity="cosine",
    sql_url = f"sqlite:///{DOCUMENT_STORE_NAME_OUT}_sqldb.db",
    )
# Write every split produced by the preprocessing cell. `all_docs` accumulates the
# output for all DOC_LENGTHS, whereas `docs` only holds the last split length, so
# writing `docs` would drop most documents when DOC_LENGTH == 'mixed'.
document_store.write_documents(all_docs, batch_size=5000)

Writing Documents:   0%|          | 0/35874 [00:00<?, ?it/s]

In [11]:
# Retriever that embeds text with MODEL_NAME, loaded in sentence-transformers format.
retriever = EmbeddingRetriever(
    embedding_model=MODEL_NAME,
    model_format="sentence_transformers",
    max_seq_len=500,
    document_store=document_store,
    progress_bar=True,
)

# Compute embeddings for the stored documents in batches of 5000.
# update_existing_embeddings=False leaves documents that already have an
# embedding untouched, so only documents without one are embedded.
document_store.update_embeddings(
    retriever,
    batch_size=5000,
    update_existing_embeddings=False,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model TheSpaceManG/wildbow-distilbert


Downloading (…)2f043/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8f53f2f043/README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading (…)53f2f043/config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)2f043/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading (…)8f53f2f043/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)3f2f043/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
INFO:haystack.document_stores.faiss:Updating embeddings for 35874 docs...


Updating Embedding:   0%|          | 0/35874 [00:00<?, ? docs/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [12]:
# Persist the FAISS index and its JSON config under the shared output base name,
# relative to the current working directory.
document_store.save(
    config_path=DOCUMENT_STORE_NAME_OUT + '.json',
    index_path=DOCUMENT_STORE_NAME_OUT + '.index',
)

In [13]:
# Sanity check: the number of stored embeddings should equal the number of
# documents that were written to the store.
embedded_doc_count = document_store.get_embedding_count()
embedded_doc_count

35874

In [14]:
# Round-trip check: reload the store from the files that were just saved and make
# sure it holds as many embeddings as the in-memory store that produced them.
new_doc_store = FAISSDocumentStore.load(
    index_path=f'{DOCUMENT_STORE_NAME_OUT}.index',
    config_path=f'{DOCUMENT_STORE_NAME_OUT}.json'
    )
reloaded_count = new_doc_store.get_embedding_count()
# Enforce the "should be the same as above" expectation instead of relying on
# a visual comparison of two cell outputs.
assert reloaded_count == document_store.get_embedding_count(), (
    "Reloaded document store has a different number of embeddings than the saved one"
)
reloaded_count

35874