In [None]:

import json
import os
import re
from random import seed, shuffle
from time import sleep

from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
# from langchain.chat_models import ChatAnthropic
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [None]:
import pandas as pd


In [None]:
from ragchat.html_cleaner import HtmlCleaner
from ragchat.html_patterns import HtmlPatterns
from ragchat.text_embedder import TextEmbedder


In [None]:
# Chat model used by the question-generation chain; temperature 0 keeps
# sampling randomness to a minimum.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
)


In [None]:
# Run the project's HTML cleaner and keep its cleaned-text mapping
# (iterated with .items() by the question-generation loop).
# NOTE(review): presumably max_pages=None means "no page limit" — confirm in HtmlCleaner.
cleaner = HtmlCleaner(
    parser="inscriptis",
    debug=True,
    max_pages=None,
)
clean_text_dict = cleaner.get_clean_text_dict()


In [None]:
# Ask the LLM for hypothetical questions that each cleaned page could answer
# and collect them in `questions` as {page key: [question, ...]}.
PROGRESS_EVERY = 20  # print a progress marker after this many pages

questions = {}
prompt = ChatPromptTemplate.from_template(
    "Generate a list of 10 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)
chain = {"doc": RunnablePassthrough()} | prompt | llm | StrOutputParser()

print("# of docs: ", len(clean_text_dict), ". progress: ", end="")
for i, (pg, text_result) in enumerate(clean_text_dict.items(), start=1):
    q = chain.invoke(TextEmbedder.add_title(text_result))
    # The reply is expected to be a numbered list ("1. ...\n2. ..."); split on
    # the "\n<number>. " separators between items.
    q_list = re.split(r"\n[0-9]+\.\s", q)
    # The first item has no preceding newline, so its "1. " marker survives
    # the split and has to be removed by hand.
    if q_list[0].startswith("1. "):
        q_list[0] = q_list[0][3:]
    # Trim stray whitespace and drop empty fragments so that no blank
    # "question" is stored (and later embedded).
    questions[pg] = [item.strip() for item in q_list if item.strip()]
    # Report progress every PROGRESS_EVERY pages. The previous condition,
    # `len(clean_text_dict) % i == 20`, only fired sporadically.
    if i % PROGRESS_EVERY == 0:
        print(i, end=", ")


In [None]:
# Persist the generated questions to disk as JSON.
# exist_ok=True keeps the cell re-runnable, and the dump is deliberately NOT
# nested under a directory-existence check: otherwise the file would only be
# written on the very first run and go stale on every later one.
os.makedirs("questions", exist_ok=True)
with open("questions/synthetic_questions.json", "w") as f:
    json.dump(questions, f)


In [None]:
# Invert `questions`: map each distinct question text to the list of pages
# it was generated for (a page appears once per occurrence of the question).
question_sources = {}
for page, page_questions in questions.items():
    for question in page_questions:
        question_sources.setdefault(question, []).append(page)


In [None]:
# Duplicate check: (number of distinct questions, total questions generated).
# The two numbers differ whenever the same question text was produced more than once.
(len(question_sources), sum(len(q_list) for q_list in questions.values()))


In [None]:
# Embedding model used to vectorize the questions for the FAISS index.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)


In [None]:
# Wrap every distinct question in a Document; its metadata records the list
# of pages the question was generated from.
docs = [
    Document(page_content=question, metadata={"page_list": pages})
    for question, pages in question_sources.items()
]


In [None]:
# Build the FAISS index in batches of `ch_sz` documents: the first batch
# creates the store, every later batch is appended after a short pause.
# NOTE(review): the pause is presumably there to stay under API rate limits — confirm.
vector_store = None
n = len(docs)
ch_sz = 100
for offset in range(0, n, ch_sz):
    batch = docs[offset : offset + ch_sz]
    if vector_store is not None:
        sleep(0.5)
        vector_store.add_documents(batch)
        continue
    vector_store = FAISS.from_documents(batch, embeddings)


In [None]:
# Persist the FAISS index under the questions/ directory.
index_dir = "questions/questions_FAISS_index"
vector_store.save_local(index_dir)
